Main analysis
Performance as a function of number of repetitions. Minimum number of repetitions: 5; minimum number of weeks: 5.
Cognition (SDMT)
# Analysis parameters for the cognition (SDMT / "ips") test.
# test_code / test_metric_code select rows from the Floodlight dataset;
# min_repetitions / min_weeks are the inclusion criteria applied below;
# predictor names the column used as the learning-curve x axis.
params = list(
  test_code = "ips",
  test_metric_code = "correct_responses",
  unit = "SDMT: Correct Responses",
  unit_n = "patient",
  unit_time = "repetition",
  min_repetitions = 5,
  min_weeks = 5,
  predictor = "repetition",
  xlab = "Repetitions",
  bounded.growth.confidence.interval = TRUE, # use TRUE, not T (T is reassignable)
  up_to_date = "2021-05-01" # only data up to this date is analysed
)
library(data.table) # fread
library(parsedate) # parse_date
library(dplyr) # group_by
library(tibble) # deframe
library(lme4) # lmer
library(mgcv) # gamm
library(quantreg) # rq
library(patchwork) # plot_layout
library(gridExtra) # grid.arrange
library(ggpubr) # ggscatter
library(ggtext) # geom_text
library(sjPlot) # plot_model
# Download from: https://dataset.floodlightopen.com/public-blobs-prod/complete_dataset.csv
data = fread("complete_dataset.csv", data.table=F)
# Prepare dataset
data = data[data$testCode == params$test_code & !data$participantIsControl,]
data$time = parse_date(data$testStartedAt)
data = data[data$time <= as.POSIXct(params$up_to_date, tz="UTC"),] # only analyse data up to (excluding) params$up_to_date
data = data[!duplicated(data),] # sometimes contains true duplicates for some reason (even with the same testResultMetricId)
# For "Finger Pinching" hand_used has to be determined
if (params$test_code == "pinching") {
library(tidyr) # pivot_wider
# just one means either "hand" or "successful_pinches" values are missing, remove those
table(table(data$time))
data = data[!data$time %in% names(which(table(data$time)==1)), ]
data = as.data.frame(pivot_wider(data, id_cols=c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time"), names_from="testMetricCode", values_from="testResultMetricValue"))
} else {
data = data[data$testMetricCode == params$test_metric_code,]
data$hand_used = NA
data = data[c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time", "testResultMetricValue", "hand_used")]
}
colnames(data) = c("id", "control", "sex", "birthyear", "time", "value", "hand_used")
data$age = year(data$time)-data$birthyear # Estimate age
data = data[order(as.character(data$id)),]
# 0 result values are discarded
data = data[!is.na(data$value) & data$value != 0,]
# Consider those supposedly younger than 18 (minimum study age) and older than 90 as NA
data$age[data$age < 18 | data$age > 90] = NA
data$id_original = data$id
data$id = paste0(data$id, "_hand", data$hand_used)
data$day = as.IDate(data$time)
# Formatting helper: deliberately shadows base::round and returns a *character*
# string with exactly `digits` decimal places (used for report-style output).
round = function(x, digits=0) {
  fmt = paste0("%.", digits, "f")
  sprintf(fmt, x)
}
# ggplot theme fragments that hide one axis, plus the accent color used throughout
no_x = theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
no_y = theme(axis.text.y=element_blank(), axis.ticks.y=element_blank())
linecolor = "#c71138"
Participant selection
# At least x weeks & repetitions
# Per id: assign a 0-based repetition index and weeks elapsed since that id's
# first test. Assumes rows are time-ordered within id -- TODO confirm.
# NOTE(review): per-id loop is O(ids x rows); kept as-is to preserve behavior.
for (id in unique(data$id)) {
subset = data$id == id
n = sum(subset)
data[subset, "repetition"] = (1:n)-1
data[subset, "weeksSinceFirst"] = as.numeric(difftime(data[subset, "time"], data[subset, "time"][1], unit="weeks"))
}
# Totals before inclusion criteria (for the "x / y (%)" report strings below)
n_orig = nrow(data)
n_patients_orig = length(unique(data$id_original))
n_hands_orig = length(unique(data$id))
# Per-id participation length; last() relies on the within-id row order above
participation_duration = data %>% group_by(id) %>% summarise(weeks=last(weeksSinceFirst), repetitions=last(repetition), .groups="keep")
Among the total n=1095 patients with n=5715 repetitions, the median length of participation is 0.0 weeks (IQR 0.0-8.6, range 0.0-151.6) and the median number of repetitions is 1 (IQR 1-4, range 1-106).
# Apply inclusion criteria: >= min_weeks of participation and >= min_repetitions
# tests per id (repetition is 0-based, hence the +1)
data = data[data$id %in% participation_duration$id[participation_duration$weeks >= params$min_weeks & participation_duration$repetitions+1 >= params$min_repetitions],]
# daysSinceLast: days since the previous test of the same id (0 for the first).
# NOTE(review): the c(time[1], time[1:(n-1)]) lag indexing assumes n >= 2;
# guaranteed here because ids below min_repetitions were just removed.
for (id in unique(data$id)) {
subset = data$id == id
n = sum(subset)
data[subset, "daysSinceLast"] = as.numeric(difftime(data[subset, "time"], c(data[subset, "time"][1], data[subset, "time"][1:(n-1)]), unit="days"))
}
# Recompute per-id summary on the included cohort, now with demographics and intervals
participation_duration = data %>% group_by(id) %>% summarise(sex=first(sex), mean_age=mean(age), weeks=last(weeksSinceFirst), repetitions=last(repetition), median_intertest_interval=median(daysSinceLast), IQR_intertest_interval=IQR(daysSinceLast), .groups="keep")
# Generic predictor column selected via params$predictor ("repetition")
data$predictor = data[,params$predictor]
Inclusion criteria: participation for at least 5 weeks and at least 5 repetitions performed per test, leading to the analysis of n=251 / 1095 patients and n=4388 / 5715 tests. Among those, the median length of participation is 16.5 weeks (IQR 10.1-45.9, range 5.0-151.6) and the median number of repetitions is 11 (IQR 6.5-18, range 5-106).
# Cohort description table. `round` here is the sprintf-based formatter defined
# above, so all entries are formatted character strings.
t(data.frame(
n_patients = paste0(length(unique(data$id_original)), " / ", n_patients_orig, " (", round(length(unique(data$id_original))/n_patients_orig*100,1), "%)"),
n_hands = paste0(length(unique(data$id)), " / ", n_hands_orig, " (", round(length(unique(data$id))/n_hands_orig*100,1), "%)"),
n_tests = paste0(nrow(data), " / ", n_orig, " (", round(nrow(data)/n_orig*100,1), "%)"),
# [[2]] picks the TRUE ("female") proportion; assumes both levels occur -- TODO confirm
percent_female = paste0(round(prop.table(table(participation_duration$sex == "female"))[[2]]*100, 1)),
age = paste0(round(median(participation_duration$mean_age,na.rm=T),1), " (", round(quantile(participation_duration$mean_age, 0.25, na.rm=T),1), "-", round(quantile(participation_duration$mean_age, 0.75, na.rm=T),1), ", range ", round(min(participation_duration$mean_age, na.rm=T),1), "-", round(max(participation_duration$mean_age, na.rm=T),1), ")"),
repetitions = paste0(median(participation_duration$repetitions)+1, " repetitions (IQR ", quantile(participation_duration$repetitions+1, 0.25), "-", quantile(participation_duration$repetitions+1, 0.75), ", range ", min(participation_duration$repetitions+1), "-", max(participation_duration$repetitions+1), ")"),
median_intertest_interval = paste0(round(median(participation_duration$median_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$median_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$median_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$median_intertest_interval),1), "-", round(max(participation_duration$median_intertest_interval),1), ")"),
IQR_intertest_interval = paste0(round(median(participation_duration$IQR_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$IQR_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$IQR_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$IQR_intertest_interval),1), "-", round(max(participation_duration$IQR_intertest_interval),1), ")"),
weeks = paste0(round(median(participation_duration$weeks),1), " weeks (IQR ", round(quantile(participation_duration$weeks, 0.25),1), "-", round(quantile(participation_duration$weeks, 0.75),1), ", range ", round(min(participation_duration$weeks),1), "-", round(max(participation_duration$weeks),1), ")")
))
## [,1]
## n_patients "251 / 1095 (22.9%)"
## n_hands "251 / 1095 (22.9%)"
## n_tests "4388 / 5715 (76.8%)"
## percent_female "70.5"
## age "50.1 (42.0-58.0, range 20.0-79.0)"
## repetitions "11 repetitions (IQR 6.5-18, range 5-106)"
## median_intertest_interval "7.6 days (IQR 7.1-9.5, range 6.7-87.1)"
## IQR_intertest_interval "2.8 days (IQR 0.7-8.1, range 0.0-133.8)"
## weeks "16.5 weeks (IQR 10.1-45.9, range 5.0-151.6)"
Summary level analysis
Difference test
# Per-id first/last/mean score and participation summaries; diff = last - first
# is the observed improvement analysed below
df = as.data.frame(data %>% group_by(id) %>% summarise(first=first(value), last=last(value), mean=mean(value), weeksSinceFirst=max(weeksSinceFirst), repetition=n(), first_age=first(age), last_age=last(age), mean_age=mean(age), .groups="keep") %>% mutate(diff=last-first))
df$predictor = df[, params$predictor]
# Paired t-test: last vs first score within id
test = t.test(df$last, df$first, paired=T)
test
##
## Paired t-test
##
## data: df$last and df$first
## t = 21.067, df = 250, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 9.086812 10.960997
## sample estimates:
## mean of the differences
## 10.0239
# Baseline model: improvement explained by age and first score only (no practice term);
# coefficients are per 10 years / per 10 score points
mod0 = lm(diff ~ I(mean_age/10) + I(first/10), df)
summ0 = summary(mod0)
summ0
##
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.4317 -4.4998 -0.0443 4.4295 24.4693
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.1237 3.8580 6.253 1.76e-09 ***
## I(mean_age/10) -0.4513 0.4952 -0.911 0.363
## I(first/10) -3.0882 0.4879 -6.329 1.15e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.92 on 247 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.1671, Adjusted R-squared: 0.1604
## F-statistic: 24.78 on 2 and 247 DF, p-value: 1.555e-10
# Full model: adds log10(number of repetitions) as the practice predictor
mod = lm(diff ~ I(mean_age/10) + I(first/10) + log10(predictor), df)
confint(mod)
## 2.5 % 97.5 %
## (Intercept) 13.019407 27.3652977
## I(mean_age/10) -2.076327 -0.2111537
## I(first/10) -4.304726 -2.5062707
## log10(predictor) 5.412423 10.3436471
# Full-model summary; kept for the R-squared and coefficient plot below
summ = summary(mod)
summ
##
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10) + log10(predictor),
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.2859 -3.7000 -0.1442 4.1498 21.8293
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.1924 3.6417 5.545 7.58e-08 ***
## I(mean_age/10) -1.1437 0.4735 -2.416 0.0164 *
## I(first/10) -3.4055 0.4565 -7.459 1.49e-12 ***
## log10(predictor) 7.8780 1.2518 6.293 1.41e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.435 on 246 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2826, Adjusted R-squared: 0.2739
## F-statistic: 32.31 on 3 and 246 DF, p-value: < 2.2e-16
# additional variance explained by predictor
#print(summ$r.squared - summ0$r.squared)
# Observed improvement as % of the mean baseline score, with the paired t-test CI
print(paste0("Average observed improvement over baseline: ", round(test$estimate/mean(df$first)*100, 1), " (", round(test$conf[1]/mean(df$first)*100, 1), "-", round(test$conf[2]/mean(df$first)*100, 1), ")"))
lab.y = 1.1*mean(df$last)
# Bar plot of mean +/- SE for first/mean/last score per id
p1 = ggbarplot(data.frame(Timepoint=rep(c("First","Mean","Last"),each=nrow(df)), value=c(df$first,df$mean,df$last)), "Timepoint", "value", add="mean_se", label=T, lab.nb.digits=1, lab.vjust=1.9, ylab=params$unit) + xlab("Score") #+ stat_compare_means(comparisons = list(c("First","Last")), paired=T, method="t.test", label.y=lab.y) + scale_y_continuous(expand=expansion(mult=c(0,0.1)))
# Forest plot of the full model's coefficients.
# NOTE(review): `summ` (a summary.lm) is passed instead of the fitted model
# `mod`; sjPlot::plot_model documents fitted models as input -- verify intended
p2 = plot_model(summ, show.values=T, vline.color = "grey", show.intercept=T, colors=linecolor, title=paste0("Difference from First to Last Score, R²=", round(summ$r.squared, 2)), axis.labels=rev(c("Intercept", "Age (per 10 years)", "First score (per 10)", paste0(params$xlab, " (log 10)"))), value.offset=0.3, show.p=F) + ylab("β estimates")
(p1 + p2) + plot_layout(widths=c(2,5)) & theme_pubr(base_family="Serif")
Confounders
# Pairwise relationships among age, first/last score, predictor and diff
# (scatter plots with Pearson correlation and regression line), plus marginal
# histograms; assembled into a lower-triangle matrix in the next chunk.
p_age_first = ggscatter(df, "mean_age", "first", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("First score") + theme_pubr(base_family="Serif")
p_age_pred = ggscatter(df, "mean_age", "predictor", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("Mean age") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")
p_age_last = ggscatter(df, "mean_age", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("Last score") + theme_pubr(base_family="Serif")
p_age_diff = ggscatter(df, "mean_age", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("Difference first to last score") + theme_pubr(base_family="Serif")
p_first_pred = ggscatter(df, "first", "predictor", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("First score") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")
p_first_last = ggscatter(df, "first", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("First score") + ylab("Last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)
p_first_diff = ggscatter(df, "first", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("First score") + ylab("Difference first to last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)
p_pred_last = ggscatter(df, "predictor", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_x_log10(expand = expansion(mult = c(.05, .15))) + xlab(paste0(params$xlab, " (log10)")) + ylab("Last score") + theme_pubr(base_family="Serif")
p_pred_diff = ggscatter(df, "predictor", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_x_log10(expand = expansion(mult = c(.05, .15))) + xlab(paste0(params$xlab, " (log10)")) + ylab("Difference first to last score") + theme_pubr(base_family="Serif")
p_last_diff = ggscatter(df, "last", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Last score") + ylab("Difference first to last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)
p_last_pred = ggscatter(df, "last", "predictor", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("Last score") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")
# Marginal histograms for the diagonal of the plot matrix
p_age = gghistogram(df, "mean_age", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_first = gghistogram(df, "first", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_last = gghistogram(df, "last", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_pred = gghistogram(df, "predictor", bins=15) + scale_x_log10() + xlab(NULL) + theme_pubr(base_family="Serif")
p_diff = gghistogram(df, "diff", bins=15) + xlab("Difference first to last") + theme_pubr(base_family="Serif")
#(((p1+xlab(NULL)) + (p2+xlab(NULL)+ylab(NULL))) / ((p3+xlab(NULL)) + (p4+xlab(NULL)+ylab(NULL))) / ((p5) | (p6+ylab(NULL)))) & theme_pubr(base_family="Serif")
#(p_age_first | p_first) / (p_age_last | p_first_last | p_last) / (p_age_pred | p_first_pred | p_last_pred | p_pred) / (p_age_diff | p_first_diff | p_last_diff | p_pred_diff)
# 5x5 lower-triangle layout: histograms on the diagonal, scatter plots below;
# cells are numbered column-wise by lower.tri
m <- matrix(NA, 5, 5)
m[lower.tri(m, diag = T)] <- 1:15
grid.arrange(grobs=list(
p_age, p_age_first+xlab(NULL), p_age_pred+xlab(NULL), p_age_last+xlab(NULL), p_age_diff,
p_first, p_first_pred+xlab(NULL)+ylab(""), p_first_last+xlab(NULL)+ylab(""), p_first_diff+ylab(""),
p_pred, p_pred_last+xlab(NULL)+ylab(""), p_pred_diff+ylab(""),
p_last, p_last_diff+ylab(""),
p_diff
), layout_matrix=m, heights=c(1,1,1,1,1.1))
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
#GGally::ggpairs(df[c("mean_age", "first", "last", "predictor", "diff")])
#pairs(df[c("mean_age", "first", "last", "predictor", "diff")], upper.panel=NULL)
#corrplot::corrplot(cor(df[c("mean_age", "first", "last", "predictor", "diff")], use="complete.obs"))
Learning curve: Model selection
# Reference model: penalized-spline GAMM with a random intercept per id --
# a flexible learning-curve shape the parametric candidates are compared against
smoothing_spline = gamm(value ~ s(predictor, bs="ps"), random=list(id=~1), data=data)
summary(smoothing_spline$lme)
## Linear mixed-effects model fit by maximum likelihood
## Data: strip.offset(mf)
## AIC BIC logLik
## 24969.01 25000.94 -12479.5
##
## Random effects:
## Formula: ~Xr - 1 | g
## Structure: pdIdnot
## Xr1 Xr2 Xr3 Xr4 Xr5 Xr6 Xr7 Xr8
## StdDev: 5.6785 5.6785 5.6785 5.6785 5.6785 5.6785 5.6785 5.6785
##
## Formula: ~1 | id %in% g
## (Intercept) Residual
## StdDev: 9.459975 3.642742
##
## Fixed effects: y ~ X - 1
## Value Std.Error DF t-value p-value
## X(Intercept) 47.92484 0.60361 4136 79.39700 0
## Xs(predictor)Fx1 33.02793 8.11804 4136 4.06846 0
## Correlation:
## X(Int)
## Xs(predictor)Fx1 0.003
##
## Standardized Within-Group Residuals:
## Min Q1 Med Q3 Max
## -7.78184177 -0.53101332 0.05473965 0.59964708 4.47513183
##
## Number of Observations: 4388
## Number of Groups:
## g id %in% g
## 1 251
# GAM-side summary: edf and approximate significance of the smooth term
summary(smoothing_spline$gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## value ~ s(predictor, bs = "ps")
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47.9248 0.6035 79.41 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(predictor) 8.14 8.14 365.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.126
## Scale est. = 13.27 n = 4388
# Candidate 1: linear mixed-effects model with random intercept per id
REM_linear = lmer(value ~ (1|id) + predictor, data)
# Population-level (fixed-effects) prediction at repetition t
equ_linear = function(t) fixef(REM_linear)[1] + fixef(REM_linear)[2]*t
summary(REM_linear)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor
## Data: data
##
## REML criterion at convergence: 25903.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.3068 -0.5274 0.0905 0.6226 4.1418
##
## Random effects:
## Groups Name Variance Std.Dev.
## id (Intercept) 90.20 9.497
## Residual 16.83 4.102
## Number of obs: 4388, groups: id, 251
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 44.088932 0.606428 72.70
## predictor 0.183064 0.004772 38.36
##
## Correlation of Fixed Effects:
## (Intr)
## predictor -0.065
# Candidate 2: quadratic mixed-effects model (same random-intercept structure)
REM_quadratic = lmer(value ~ (1|id) + predictor + I(predictor^2), data)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
# Population-level prediction at repetition t
equ_quadratic = function(t) fixef(REM_quadratic)[1] + fixef(REM_quadratic)[2]*t + fixef(REM_quadratic)[3]*t^2
summary(REM_quadratic)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor + I(predictor^2)
## Data: data
##
## REML criterion at convergence: 25530.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.5414 -0.5298 0.0717 0.6076 4.0476
##
## Random effects:
## Groups Name Variance Std.Dev.
## id (Intercept) 89.99 9.486
## Residual 15.32 3.915
## Number of obs: 4388, groups: id, 251
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 42.9686500 0.6076484 70.71
## predictor 0.3925009 0.0113252 34.66
## I(predictor^2) -0.0030669 0.0001518 -20.20
##
## Correlation of Fixed Effects:
## (Intr) prdctr
## predictor -0.109
## I(prdctr^2) 0.091 -0.916
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
# Candidate 3: bounded-growth model via the SSasymp self-starting curve:
# value(t) = yf + (y0 - yf) * exp(-exp(log_alpha) * t),
# with per-id random effects on baseline (y0) and asymptote (yf)
REM_bounded = nlmer(value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0|id) + (yf|id), data = data, start=c(yf=40, y0=20, log_alpha=-1))
y0=fixef(REM_bounded)["y0"] # baseline score at t = 0
yf=fixef(REM_bounded)["yf"] # asymptote ("practice boundary")
log_alpha=fixef(REM_bounded)["log_alpha"] # log learning rate
# Curve with overridable parameters (defaults = fitted fixed effects; the
# explicit arguments are used later for the confidence ribbon)
equ_bounded = function(t, yf=fixef(REM_bounded)[["yf"]], y0=fixef(REM_bounded)[["y0"]], log_alpha=fixef(REM_bounded)[["log_alpha"]]) yf+(y0-yf)*exp(-exp(log_alpha)*t)
summary(REM_bounded)
## Warning in vcov.merMod(object, use.hessian = use.hessian): variance-covariance matrix computed from finite-difference Hessian is
## not positive definite or contains NA values: falling back to var-cov estimated from RX
## Warning in vcov.merMod(object, correlation = correlation, sigm = sig): variance-covariance matrix computed from finite-difference Hessian is
## not positive definite or contains NA values: falling back to var-cov estimated from RX
## Nonlinear mixed model fit by maximum likelihood ['nlmerMod']
## Formula: value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0 | id) + (yf |
## id)
## Data: data
##
## AIC BIC logLik deviance df.resid
## 25040.5 25078.8 -12514.3 25028.5 4382
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.0591 -0.5246 0.0596 0.5963 3.8253
##
## Random effects:
## Groups Name Variance Std.Dev.
## id y0 95.33 9.764
## id.1 yf 171.06 13.079
## Residual 12.22 3.495
## Number of obs: 4388, groups: id, 251
##
## Fixed effects:
## Estimate Std. Error t value
## yf 56.74459 1.03625 54.76
## y0 40.65554 0.63254 64.27
## log_alpha -2.59218 0.04834 -53.62
##
## Correlation of Fixed Effects:
## yf y0
## y0 -0.016
## log_alpha -0.454 -0.084
# Learning rate on the natural scale
exp(log_alpha)
## log_alpha
## 0.0748565
# Relative improvement implied by the fitted asymptote vs baseline
cat("Average improvement over baseline: ", (yf-y0)/y0*100)
## Average improvement over baseline: 39.57408
# Model comparison: RMSE, MAE, AIC and effective degrees of freedom
RMSE = sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) sqrt(mean(resid(mod)^2))) # RMSE
RMSE
## [1] 3.984912 3.801991 3.535307 3.314229
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) mean(abs(resid(mod)))) # MAE
## [1] 2.977195 2.828032 2.619453 2.497699
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) extractAIC(mod)) # edf & AIC: smoothing_spline$lme always has edf 5
## [,1] [,2] [,3] [,4]
## [1,] 4.00 5.00 5.00 6.00
## [2,] 25903.69 25516.28 24969.01 25040.51
edf = sapply(list(REM_linear, REM_quadratic, smoothing_spline$gam, REM_bounded), function(mod) { nrow(data)-df.residual(mod) }) # while smoothing_spline$gam often has much higher edf
edf
## [1] 4.000000 5.000000 9.139569 6.000000
# NOTE: `round` is the sprintf formatter defined above, so RMSE and edf become
# character vectors here (they are only used in plot labels below)
RMSE = round(RMSE,1)
edf = round(edf,1)
Plot
# For each id, its last predictor value (total repetitions), as a named vector.
# NOTE(review): the column name `weeks` is misleading -- it holds last(predictor),
# i.e. repetitions when params$predictor == "repetition"
participation_duration = data %>% group_by(id) %>% summarise(weeks=last(predictor), .groups="keep") %>% deframe()
# Number of ids still contributing data at each predictor bin (width 10)
remaining_participants_bins = seq(0,350,by=10)
remaining_participants = data.frame(x=remaining_participants_bins, text=sapply(remaining_participants_bins, function(x) sum(participation_duration>=x)))
# Truncate the x axis at the first bin with fewer than 10 remaining ids
xmax = remaining_participants$x[which(remaining_participants$text<10)[1]]
# Middle 90% of observed values, used as y-axis limits
range_90p =quantile(data$value, probs=c(0.05,0.95))
p1 = ggplot() +
geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.1) +
# Smoothing-spline reference (long-dashed black), then the three parametric fits
geom_line(aes(x,y), data.frame(x=0:xmax, y=predict(smoothing_spline$gam, newdata=data.frame(predictor=0:xmax))), linetype="longdash", size=1) + xlim(0,xmax) + ylim(range_90p[1], range_90p[2]) +
stat_function(fun=equ_linear, color="blue", linetype="dotted", size=1) +
stat_function(fun=equ_quadratic, color="green4", linetype="dashed", size=1) +
stat_function(fun=equ_bounded, color=linecolor, size=1) +
theme_pubr(base_family="Serif") + no_x + xlab(NULL) + ylab(params$unit) +
# In-plot legend with each model's RMSE and edf (pre-formatted strings)
geom_richtext(aes(x,y,label=label,hjust=1), data.frame(x=0.8*xmax, y=range_90p[2]-(range_90p[2]-range_90p[1])/1.3, label=paste0("Model RMSE (edf):<br><span style='color:#0000ff'>····· Linear: ", RMSE[1], " (", edf[1], ") </span><br><span style='color:#008b00'>- - - Quadratic: ", RMSE[2], " (", edf[2], ") </span><br><span style='color:#000000'>— — Smoothing spline: ", RMSE[3], " (", edf[3], ") </span><br><span style='color:", linecolor, "'>— Bounded growth: ", RMSE[4], " (", edf[4], ") </span>")), family="Serif")
# Bottom strip: remaining participant counts along the x axis
p2 = ggplot(remaining_participants[remaining_participants$x<=xmax,]) +
geom_text(aes(x=x,y="A",label=text), family="Serif") +
theme_pubr(base_family="Serif") + no_y + xlab(params$xlab) + ylab(paste0("Remaining \n ", params$unit_n, "s")) +
scale_x_continuous(breaks=seq(0,xmax,by=10))
(p1 / p2) + plot_layout(heights=c(0.9,0.1))
## Warning: Removed 302 row(s) containing missing values (geom_path).
# Profile-likelihood 95% CIs for the bounded-growth fixed effects (slow step)
if (params$bounded.growth.confidence.interval) conf = confint.merMod(REM_bounded, c("y0","yf","log_alpha"), method="profile")
## Computing profile confidence intervals ...
## Warning in vcov.merMod(object, use.hessian = use.hessian): variance-covariance matrix computed from finite-difference Hessian is
## not positive definite or contains NA values: falling back to var-cov estimated from RX
## Warning in vcov.merMod(object, correlation = correlation, sigm = sig): variance-covariance matrix computed from finite-difference Hessian is
## not positive definite or contains NA values: falling back to var-cov estimated from RX
if (params$bounded.growth.confidence.interval) {
print(conf)
# Learning-rate CI on the natural scale (row 3 = log_alpha)
print(exp(conf[3,]))
# CI for the relative improvement, pairing like bounds of yf (row 1) and y0 (row 2).
# NOTE(review): pairing the two lower (resp. upper) bounds is an approximation,
# not a formal CI for the ratio -- flag for discussion
print(paste0("Average boundary improvement over baseline: ", round((yf-y0)/y0*100, 1), " (", round((conf[1,1]-conf[2,1])/conf[2,1]*100, 1), "-", round((conf[1,2]-conf[2,2])/conf[2,2]*100, 1), ")"))
}
## 2.5 % 97.5 %
## yf 54.629934 59.030012
## y0 39.403849 41.906512
## log_alpha -2.726682 -2.461826
## 2.5 % 97.5 %
## 0.06543607 0.08527909
## [1] "Average boundary improvement over baseline: 39.6 (38.6-40.9)"
Bounded growth model
# First derivative of the bounded-growth curve: slope at time t
equ_diff_REM_bounded = function(t) exp(log_alpha)*(yf-y0)*exp(exp(log_alpha)*-t)
# Inverse of the derivative: time at which the slope drops to target_slope
equ_diff_get_time_REM_bounded = function(target_slope) log(exp(log_alpha)*(yf-y0)/target_slope)/exp(log_alpha)
# Inverse of the curve: time at which target_value is reached
equ_bounded_get_x = function(target_value) exp(-log_alpha)*log((yf-y0)/(yf-target_value))
# Annotated milestones: baseline, half-practice point, and 90% of the practice gain
growth_percentiles = c(0, 0.5, 0.9)
names_percentiles = c("baseline", "half-practice point", "90% practice")
selected_timepoints = equ_bounded_get_x(y0+(yf-y0)*growth_percentiles)
example_slopes_bounded = data.frame(
x=selected_timepoints,
y=equ_bounded(selected_timepoints),
label=paste0("y=", round(equ_bounded(selected_timepoints),1), ", m=", signif(equ_diff_REM_bounded(selected_timepoints),2), " at ", params$unit_time, " ", round(selected_timepoints,0), ", ", names_percentiles),
vjust=1.5
)
# Extra annotation row for the asymptote ("boundary") itself
example_slopes_bounded = rbind(example_slopes_bounded, list(x=0.83*xmax, y=yf, label=paste0("boundary: ", round(yf, 1)), vjust=-1.0))
# Confidence ribbon from the profile bounds.
# NOTE(review): combines same-side bounds of all three parameters; this is an
# approximation, not a pointwise confidence band -- confirm intent
if (params$bounded.growth.confidence.interval) ribbon = data.frame(x=seq(0,xmax,0.05), ymin=equ_bounded(seq(0,xmax,0.05), conf["yf","2.5 %"], conf["y0","2.5 %"], conf["log_alpha","2.5 %"]), ymax=equ_bounded(seq(0,xmax,0.05), conf["yf","97.5 %"], conf["y0","97.5 %"], conf["log_alpha","97.5 %"]))
# Tests-per-id quantiles for the report line below
quant = quantile(table(data$id))
print(paste0("n tests = ", nrow(data), " (n ", params$unit_n, "s = ", length(unique(data$id)), ", median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 4388 (n patients = 251, median tests per patient: 11, IQR 6.5-18)"
# Bounded-growth curve over the raw per-id trajectories, with the half-practice
# point marked by a dashed vertical line and milestone annotations
p1 = ggplot() + geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.2) +
theme_pubr(base_family="Serif") + scale_x_continuous(limits = c(0,xmax), expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + xlab(params$xlab) + ylab(params$unit) +
geom_vline(xintercept=example_slopes_bounded[2,"x"], color=linecolor, linetype=2) +
stat_function(fun=equ_bounded, color=linecolor, size=1) +
geom_point(data=example_slopes_bounded[1:(nrow(example_slopes_bounded)-1),], aes(x,y), color=linecolor, size=5) +
geom_text(data=example_slopes_bounded, aes(x,y,label=label, vjust=vjust), color=linecolor, hjust=-0.01, family="Serif")
if (params$bounded.growth.confidence.interval) p1 = p1 + geom_ribbon(aes(x=x, ymin=ymin, ymax=ymax), ribbon, fill=linecolor, alpha=0.3)
p1
## Warning: Removed 91 row(s) containing missing values (geom_path).
Quantile regression
# Censor data at the half-practice point (or a user-supplied cutoff) so the
# quantile regressions describe the early, approximately linear part of the curve.
# NOTE: `round` is the string formatter, so round(...) yields e.g. "11";
# as.integer() parses it back to an integer
if (is.null(params$censor_after)) {
censor_after = as.integer(round(selected_timepoints[2])) # half-practice point
} else {
censor_after = params$censor_after
}
data_censored = data[data$predictor <= censor_after,]
# Quantile regression of value on predictor at five percentiles
percentiles = c(0.05,0.25,0.5,0.75,0.95)
QR = rq(value ~ predictor, tau=percentiles, data_censored)
# Per-tau coefficients with kernel SEs; prints Wald 95% CIs and collects the
# slope p-values.
# NOTE(review): summary(QR) is recomputed on every iteration -- works, but
# redundant; could be hoisted out of the loop
p_vals = sapply(1:length(summary(QR)), function(i) {
summ = coef(summary(QR, se="ker")[[i]])
print(summ)
print(paste0("Intercept: ", round(summ[1,1],1), " (", round(summ[1,1]-1.96*summ[1,2],1), "-", round(summ[1,1]+1.96*summ[1,2],1), "), beta: ", round(summ[2,1],2), " (", round(summ[2,1]-1.96*summ[2,2],2), "-", round(summ[2,1]+1.96*summ[2,2],2), ")"))
summ[2,4]
})
## Value Std. Error t value Pr(>|t|)
## (Intercept) 22.500000 1.0682255 21.062969 0.000000e+00
## predictor 1.166667 0.2014399 5.791636 8.016017e-09
## [1] "Intercept: 22.5 (20.4-24.6), beta: 1.17 (0.77-1.56)"
## Value Std. Error t value Pr(>|t|)
## (Intercept) 35.0000000 0.5341734 65.521795 0.00000e+00
## predictor 0.8333333 0.1059255 7.867164 5.77316e-15
## [1] "Intercept: 35.0 (34.0-36.0), beta: 0.83 (0.63-1.04)"
## Value Std. Error t value Pr(>|t|)
## (Intercept) 41 0.4823704 84.996921 0
## predictor 1 0.1047662 9.545067 0
## [1] "Intercept: 41.0 (40.1-41.9), beta: 1.00 (0.79-1.21)"
## Value Std. Error t value Pr(>|t|)
## (Intercept) 47 0.5122156 91.758239 0
## predictor 1 0.1087158 9.198297 0
## [1] "Intercept: 47.0 (46.0-48.0), beta: 1.00 (0.79-1.21)"
## Value Std. Error t value Pr(>|t|)
## (Intercept) 55.875 0.7185765 77.757901 0.000000e+00
## predictor 1.125 0.1737523 6.474736 1.178009e-10
## [1] "Intercept: 55.9 (54.5-57.3), beta: 1.12 (0.78-1.47)"
# Bonferroni correction across the five slope tests
p_vals = p.adjust(p_vals, method="bonferroni")
# Joint test of slope equality across quantiles
ANOVA = anova(QR)
quant = quantile(table(data_censored$id))
print(paste0("n tests = ", nrow(data_censored), " (median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 2111 (median tests per patient: 10, IQR 6.5-10)"
# Format a p-value for display: "= <rounded to `digits` significant figures>",
# or "< 2e-16" when rounding collapses it to exactly zero.
signif_p = function(x, digits=1) {
  rounded = signif(x, digits)
  if (as.character(rounded) == "0") {
    return("< 2e-16")
  }
  paste0("= ", rounded)
}
# Spaghetti plot of the censored data with the five quantile-regression lines,
# per-percentile slopes and Bonferroni-adjusted p-values, plus the joint
# ANOVA p-value and the censoring cutoff (dashed vertical line)
ggplot() + geom_line(aes_string("predictor", "value", group="id"), data_censored, alpha=0.2, color="darkgrey") + theme_pubr(base_family="Serif") + scale_x_continuous(expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + theme(legend.position = "none") + xlab(params$xlab) + ylab(params$unit) +
geom_abline(intercept=coef(QR)[1,], slope=coef(QR)[2,], color=linecolor) +
geom_text(data=data.frame(intercept=coef(QR)[1,], label=paste0(percentiles*100, "th percentile: β = ", round(coef(QR)[2,],1), ", ", "p.adj ", sapply(p_vals,signif_p))),
mapping=aes(x=1,y=intercept, label=label), color=linecolor, hjust="left", vjust=1, family="Serif") +
coord_cartesian(xlim=c(0,censor_after)) +
geom_text(aes(x=x,y=y,label=label), data.frame(x=0.8*censor_after, y=0, label=paste0("ANOVA p ", signif_p(ANOVA$table$pvalue, 1))), vjust=-1.5, family="Serif") +
geom_vline(xintercept=censor_after, color=linecolor, linetype=2)
Dexterity (Finger Pinching)
# Analysis parameters for the dexterity (Finger Pinching) test; same structure
# as the SDMT parameter list, with the pinching-specific test codes and the
# analysis unit switched from patient to hand.
params = list(
  test_code = "pinching",
  test_metric_code = "successful_pinches",
  unit = "Pinching: Successful Pinches",
  unit_n = "hand",
  unit_time = "repetition",
  min_repetitions = 5,
  min_weeks = 5,
  predictor = "repetition",
  xlab = "Repetitions",
  bounded.growth.confidence.interval = TRUE, # use TRUE, not T (T is reassignable)
  up_to_date = "2021-05-01" # only data up to this date is analysed
)
library(data.table) # fread
library(parsedate) # parse_date
library(dplyr) # group_by
library(tibble) # deframe
library(lme4) # lmer
library(mgcv) # gamm
library(quantreg) # rq
library(patchwork) # plot_layout
library(gridExtra) # grid.arrange
library(ggpubr) # ggscatter
library(ggtext) # geom_text
library(sjPlot) # plot_model
# Download from: https://dataset.floodlightopen.com/public-blobs-prod/complete_dataset.csv
data = fread("complete_dataset.csv", data.table=F)
# Prepare dataset
# Keep only the selected test and exclude control participants
data = data[data$testCode == params$test_code & !data$participantIsControl,]
data$time = parse_date(data$testStartedAt)
data = data[data$time <= as.POSIXct(params$up_to_date, tz="UTC"),] # only analyse data up to params$up_to_date (<= keeps a test at exactly midnight UTC of that date)
data = data[!duplicated(data),] # sometimes contains true duplicates for some reason (even with the same testResultMetricId)
# For "Finger Pinching" hand_used has to be determined
if (params$test_code == "pinching") {
library(tidyr) # pivot_wider
# a pinching test contributes two metric rows sharing one timestamp;
# just one means either "hand" or "successful_pinches" values are missing, remove those
table(table(data$time))
data = data[!data$time %in% names(which(table(data$time)==1)), ]
# spread the two metric rows of each test into separate columns
data = as.data.frame(pivot_wider(data, id_cols=c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time"), names_from="testMetricCode", values_from="testResultMetricValue"))
} else {
data = data[data$testMetricCode == params$test_metric_code,]
data$hand_used = NA # hand is only meaningful for the pinching test
data = data[c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time", "testResultMetricValue", "hand_used")]
}
# Normalise column names across test types (order matches the selection above)
colnames(data) = c("id", "control", "sex", "birthyear", "time", "value", "hand_used")
data$age = year(data$time)-data$birthyear # Estimate age (calendar-year difference only)
data = data[order(as.character(data$id)),]
# 0 result values are discarded
data = data[!is.na(data$value) & data$value != 0,]
# Consider those supposedly younger than 18 (minimum study age) and older than 90 as NA
data$age[data$age < 18 | data$age > 90] = NA
# Each hand is analysed as its own unit: id becomes participant id + hand
# (for non-pinching tests hand_used is NA, so every id simply gets a "_handNA" suffix)
data$id_original = data$id
data$id = paste0(data$id, "_hand", data$hand_used)
data$day = as.IDate(data$time)
round = function(x, digits=0) sprintf(paste0("%.", digits, "f"), x)
# ggplot helpers: themes that hide one axis, and the shared accent colour
no_x = theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
no_y = theme(axis.text.y=element_blank(), axis.ticks.y=element_blank())
linecolor = "#c71138"
Participant selection
# At least x weeks & repetitions
# Per unit: 0-based repetition index and weeks elapsed since the unit's first test
for (id in unique(data$id)) {
subset = data$id == id
n = sum(subset)
data[subset, "repetition"] = (1:n)-1
data[subset, "weeksSinceFirst"] = as.numeric(difftime(data[subset, "time"], data[subset, "time"][1], unit="weeks"))
}
# Totals before applying the inclusion criteria (used for the "x / y" summaries below)
n_orig = nrow(data)
n_patients_orig = length(unique(data$id_original))
n_hands_orig = length(unique(data$id))
# last() per unit carries total participation length and repetition count
# (assumes rows are time-ordered within each id — TODO confirm source ordering)
participation_duration = data %>% group_by(id) %>% summarise(weeks=last(weeksSinceFirst), repetitions=last(repetition), .groups="keep")
Among the total n=1816 patients with n=20695 repetitions, the median length of participation is 0.6 weeks (IQR 0.0-8.1, range 0.0-151.7) and the median number of repetitions is 2 (IQR 1-7, range 1-370).
# Keep units with >= min_weeks of participation and >= min_repetitions tests
data = data[data$id %in% participation_duration$id[participation_duration$weeks >= params$min_weeks & participation_duration$repetitions+1 >= params$min_repetitions],]
# Days elapsed since each unit's previous test (0 for the unit's first test)
for (id in unique(data$id)) {
subset = data$id == id
n = sum(subset)
# NOTE(review): 1:(n-1) would misbehave for n == 1, but the filter above guarantees n >= min_repetitions (5)
data[subset, "daysSinceLast"] = as.numeric(difftime(data[subset, "time"], c(data[subset, "time"][1], data[subset, "time"][1:(n-1)]), unit="days"))
}
# Per-unit summary after filtering, now including inter-test interval statistics
participation_duration = data %>% group_by(id) %>% summarise(sex=first(sex), mean_age=mean(age), weeks=last(weeksSinceFirst), repetitions=last(repetition), median_intertest_interval=median(daysSinceLast), IQR_intertest_interval=IQR(daysSinceLast), .groups="keep")
data$predictor = data[,params$predictor] # the x variable used by all following models (here: repetition)
Inclusion criteria: participation for at least 5 weeks and at least 5 repetitions performed per test, leading to the analysis of n=252 / 1059 patients, 470 / 1816 hands and n=17945 / 20695 tests. Among those, the median length of participation is 15.6 weeks (IQR 9.9-43.4, range 5.0-151.7) and the median number of repetitions is 18 (IQR 10-42.75, range 5-370).
# Cohort description table: counts relative to the pre-filter totals,
# demographics, repetitions and inter-test intervals (all as formatted strings;
# note that round() here is the character-returning formatter defined above)
t(data.frame(
n_patients = paste0(length(unique(data$id_original)), " / ", n_patients_orig, " (", round(length(unique(data$id_original))/n_patients_orig*100,1), "%)"),
n_hands = paste0(length(unique(data$id)), " / ", n_hands_orig, " (", round(length(unique(data$id))/n_hands_orig*100,1), "%)"),
n_tests = paste0(nrow(data), " / ", n_orig, " (", round(nrow(data)/n_orig*100,1), "%)"),
# [[2]] is the TRUE entry of the table — assumes both sexes occur in the cohort
percent_female = paste0(round(prop.table(table(participation_duration$sex == "female"))[[2]]*100, 1)),
age = paste0(round(median(participation_duration$mean_age,na.rm=T),1), " (", round(quantile(participation_duration$mean_age, 0.25, na.rm=T),1), "-", round(quantile(participation_duration$mean_age, 0.75, na.rm=T),1), ", range ", round(min(participation_duration$mean_age, na.rm=T),1), "-", round(max(participation_duration$mean_age, na.rm=T),1), ")"),
# +1 converts the 0-based repetition index to a count
repetitions = paste0(median(participation_duration$repetitions)+1, " repetitions (IQR ", quantile(participation_duration$repetitions+1, 0.25), "-", quantile(participation_duration$repetitions+1, 0.75), ", range ", min(participation_duration$repetitions+1), "-", max(participation_duration$repetitions+1), ")"),
median_intertest_interval = paste0(round(median(participation_duration$median_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$median_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$median_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$median_intertest_interval),1), "-", round(max(participation_duration$median_intertest_interval),1), ")"),
IQR_intertest_interval = paste0(round(median(participation_duration$IQR_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$IQR_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$IQR_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$IQR_intertest_interval),1), "-", round(max(participation_duration$IQR_intertest_interval),1), ")"),
weeks = paste0(round(median(participation_duration$weeks),1), " weeks (IQR ", round(quantile(participation_duration$weeks, 0.25),1), "-", round(quantile(participation_duration$weeks, 0.75),1), ", range ", round(min(participation_duration$weeks),1), "-", round(max(participation_duration$weeks),1), ")")
))
## [,1]
## n_patients "252 / 1059 (23.8%)"
## n_hands "470 / 1816 (25.9%)"
## n_tests "17945 / 20695 (86.7%)"
## percent_female "71.7"
## age "49.8 (41.7-57.5, range 20.0-79.0)"
## repetitions "18 repetitions (IQR 10-42.75, range 5-370)"
## median_intertest_interval "3.1 days (IQR 2.1-4.9, range 1.9-39.2)"
## IQR_intertest_interval "3.0 days (IQR 1.0-7.0, range 0.0-101.5)"
## weeks "15.6 weeks (IQR 9.9-43.4, range 5.0-151.7)"
Summary level analysis
Difference test
# Per-unit summary: first/mean/last score, participation length, number of
# tests (repetition = n() here is a count, not the 0-based index), ages, and
# the improvement diff = last - first
df = as.data.frame(data %>% group_by(id) %>% summarise(first=first(value), last=last(value), mean=mean(value), weeksSinceFirst=max(weeksSinceFirst), repetition=n(), first_age=first(age), last_age=last(age), mean_age=mean(age), .groups="keep") %>% mutate(diff=last-first))
df$predictor = df[, params$predictor]
# Paired t-test: last vs first score per unit
test = t.test(df$last, df$first, paired=T)
test
##
## Paired t-test
##
## data: df$last and df$first
## t = 21.525, df = 469, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 13.41214 16.10701
## sample estimates:
## mean of the differences
## 14.75957
# Baseline model: improvement explained by age and first score only
# (predictors scaled by 10 so coefficients read "per decade" / "per 10 points")
mod0 = lm(diff ~ I(mean_age/10) + I(first/10), df)
summ0 = summary(mod0)
summ0
##
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.110 -9.844 0.538 10.620 32.248
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.9357 3.6391 9.325 <2e-16 ***
## I(mean_age/10) -1.1000 0.6020 -1.827 0.0683 .
## I(first/10) -5.2868 0.5221 -10.127 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.48 on 465 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.182, Adjusted R-squared: 0.1785
## F-statistic: 51.73 on 2 and 465 DF, p-value: < 2.2e-16
# Full model: additionally adjust for (log10) number of repetitions performed
mod = lm(diff ~ I(mean_age/10) + I(first/10) + log10(predictor), df)
confint(mod)
##                       2.5 %    97.5 %
## (Intercept)      15.688484 29.150318
## I(mean_age/10)   -3.597866 -1.420346
## I(first/10)      -6.350855 -4.514504
## log10(predictor) 11.618785 16.777621
summ = summary(mod)
summ
##
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10) + log10(predictor),
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -37.795 -7.489 0.957 8.396 27.821
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.4194 3.4252 6.545 1.57e-10 ***
## I(mean_age/10) -2.5091 0.5541 -4.529 7.56e-06 ***
## I(first/10) -5.4327 0.4672 -11.627 < 2e-16 ***
## log10(predictor) 14.1982 1.3126 10.817 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.06 on 464 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.3467, Adjusted R-squared: 0.3425
## F-statistic: 82.09 on 3 and 464 DF, p-value: < 2.2e-16
# additional variance explained by predictor
#print(summ$r.squared - summ0$r.squared)
# Observed improvement as a percentage of the mean baseline score, with the
# paired t-test confidence interval transformed onto the same scale
print(paste0("Average observed improvement over baseline: ", round(test$estimate/mean(df$first)*100, 1), " (", round(test$conf[1]/mean(df$first)*100, 1), "-", round(test$conf[2]/mean(df$first)*100, 1), ")"))
## [1] "Average observed improvement over baseline: 56.6 (51.4-61.8)"
lab.y = 1.1*mean(df$last)
# Bar plot of first/mean/last scores and forest plot of the full model
p1 = ggbarplot(data.frame(Timepoint=rep(c("First","Mean","Last"),each=nrow(df)), value=c(df$first,df$mean,df$last)), "Timepoint", "value", add="mean_se", label=T, lab.nb.digits=1, lab.vjust=1.9, ylab=params$unit) + xlab("Score") #+ stat_compare_means(comparisons = list(c("First","Last")), paired=T, method="t.test", label.y=lab.y) + scale_y_continuous(expand=expansion(mult=c(0,0.1)))
p2 = plot_model(summ, show.values=T, vline.color = "grey", show.intercept=T, colors=linecolor, title=paste0("Difference from First to Last Score, R²=", round(summ$r.squared, 2)), axis.labels=rev(c("Intercept", "Age (per 10 years)", "First score (per 10)", paste0(params$xlab, " (log 10)"))), value.offset=0.3, show.p=F) + ylab("β estimates")
(p1 + p2) + plot_layout(widths=c(2,5)) & theme_pubr(base_family="Serif")
Confounders
# Pairwise scatterplots (with regression line and correlation coefficient)
# among age, first score, last score, predictor and improvement; predictor
# axes are log10-scaled. Arranged below as a lower-triangle matrix with
# histograms on the diagonal.
p_age_first = ggscatter(df, "mean_age", "first", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("First score") + theme_pubr(base_family="Serif")
p_age_pred = ggscatter(df, "mean_age", "predictor", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("Mean age") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")
p_age_last = ggscatter(df, "mean_age", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("Last score") + theme_pubr(base_family="Serif")
p_age_diff = ggscatter(df, "mean_age", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("Difference first to last score") + theme_pubr(base_family="Serif")
p_first_pred = ggscatter(df, "first", "predictor", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("First score") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")
p_first_last = ggscatter(df, "first", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("First score") + ylab("Last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)
p_first_diff = ggscatter(df, "first", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("First score") + ylab("Difference first to last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)
p_pred_last = ggscatter(df, "predictor", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_x_log10(expand = expansion(mult = c(.05, .15))) + xlab(paste0(params$xlab, " (log10)")) + ylab("Last score") + theme_pubr(base_family="Serif")
p_pred_diff = ggscatter(df, "predictor", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_x_log10(expand = expansion(mult = c(.05, .15))) + xlab(paste0(params$xlab, " (log10)")) + ylab("Difference first to last score") + theme_pubr(base_family="Serif")
p_last_diff = ggscatter(df, "last", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Last score") + ylab("Difference first to last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)
p_last_pred = ggscatter(df, "last", "predictor", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("Last score") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")
# Diagonal histograms
p_age = gghistogram(df, "mean_age", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_first = gghistogram(df, "first", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_last = gghistogram(df, "last", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_pred = gghistogram(df, "predictor", bins=15) + scale_x_log10() + xlab(NULL) + theme_pubr(base_family="Serif")
p_diff = gghistogram(df, "diff", bins=15) + xlab("Difference first to last") + theme_pubr(base_family="Serif")
#(((p1+xlab(NULL)) + (p2+xlab(NULL)+ylab(NULL))) / ((p3+xlab(NULL)) + (p4+xlab(NULL)+ylab(NULL))) / ((p5) | (p6+ylab(NULL)))) & theme_pubr(base_family="Serif")
#(p_age_first | p_first) / (p_age_last | p_first_last | p_last) / (p_age_pred | p_first_pred | p_last_pred | p_pred) / (p_age_diff | p_first_diff | p_last_diff | p_pred_diff)
# 5x5 lower-triangular layout: cells 1..15 index the grobs column by column
m <- matrix(NA, 5, 5)
m[lower.tri(m, diag = T)] <- 1:15
grid.arrange(grobs=list(
p_age, p_age_first+xlab(NULL), p_age_pred+xlab(NULL), p_age_last+xlab(NULL), p_age_diff,
p_first, p_first_pred+xlab(NULL)+ylab(""), p_first_last+xlab(NULL)+ylab(""), p_first_diff+ylab(""),
p_pred, p_pred_last+xlab(NULL)+ylab(""), p_pred_diff+ylab(""),
p_last, p_last_diff+ylab(""),
p_diff
), layout_matrix=m, heights=c(1,1,1,1,1.1))
## Warning: Removed 2 rows containing non-finite values (stat_bin).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing non-finite values (stat_cor).
## Warning: Removed 2 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing non-finite values (stat_cor).
## Warning: Removed 2 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing non-finite values (stat_cor).
## Warning: Removed 2 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing non-finite values (stat_cor).
## Warning: Removed 2 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
#GGally::ggpairs(df[c("mean_age", "first", "last", "predictor", "diff")])
#pairs(df[c("mean_age", "first", "last", "predictor", "diff")], upper.panel=NULL)
#corrplot::corrplot(cor(df[c("mean_age", "first", "last", "predictor", "diff")], use="complete.obs"))
Learning curve: Model selection
# Candidate model 1: penalised-spline GAMM with a random intercept per unit
smoothing_spline = gamm(value ~ s(predictor, bs="ps"), random=list(id=~1), data=data)
summary(smoothing_spline$lme)
## Linear mixed-effects model fit by maximum likelihood
## Data: strip.offset(mf)
## AIC BIC logLik
## 125820.5 125859.5 -62905.26
##
## Random effects:
## Formula: ~Xr - 1 | g
## Structure: pdIdnot
## Xr1 Xr2 Xr3 Xr4 Xr5 Xr6 Xr7 Xr8
## StdDev: 7.882111 7.882111 7.882111 7.882111 7.882111 7.882111 7.882111 7.882111
##
## Formula: ~1 | id %in% g
## (Intercept) Residual
## StdDev: 11.77696 7.63915
##
## Fixed effects: y ~ X - 1
## Value Std.Error DF t-value p-value
## X(Intercept) 42.46120 0.555801 17474 76.39641 0
## Xs(predictor)Fx1 45.84515 9.671501 17474 4.74023 0
## Correlation:
## X(Int)
## Xs(predictor)Fx1 0.006
##
## Standardized Within-Group Residuals:
## Min Q1 Med Q3 Max
## -6.27200948 -0.49795419 0.07459264 0.60154891 7.10975647
##
## Number of Observations: 17945
## Number of Groups:
## g id %in% g
## 1 470
summary(smoothing_spline$gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## value ~ s(predictor, bs = "ps")
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.4612 0.5558 76.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(predictor) 8.352 8.352 636.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.209
## Scale est. = 58.357 n = 17945
# Candidate model 2: linear mixed model (random intercept per unit)
REM_linear = lmer(value ~ (1|id) + predictor, data)
# fixed-effects prediction curve, used by stat_function() below
equ_linear = function(t) fixef(REM_linear)[1] + fixef(REM_linear)[2]*t
summary(REM_linear)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor
## Data: data
##
## REML criterion at convergence: 128354.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -6.2240 -0.4944 0.0992 0.6193 5.7347
##
## Random effects:
## Groups Name Variance Std.Dev.
## id (Intercept) 146.79 12.116
## Residual 67.56 8.219
## Number of obs: 17945, groups: id, 470
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 35.967096 0.568250 63.30
## predictor 0.066612 0.001424 46.78
##
## Correlation of Fixed Effects:
## (Intr)
## predictor -0.048
# Candidate model 3: quadratic mixed model (random intercept per unit)
REM_quadratic = lmer(value ~ (1|id) + predictor + I(predictor^2), data)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
# fixed-effects prediction curve
equ_quadratic = function(t) fixef(REM_quadratic)[1] + fixef(REM_quadratic)[2]*t + fixef(REM_quadratic)[3]*t^2
summary(REM_quadratic)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor + I(predictor^2)
## Data: data
##
## REML criterion at convergence: 127487.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -6.2080 -0.5027 0.0813 0.6176 6.3466
##
## Random effects:
## Groups Name Variance Std.Dev.
## id (Intercept) 141.93 11.913
## Residual 64.28 8.017
## Number of obs: 17945, groups: id, 470
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 3.477e+01 5.600e-01 62.08
## predictor 1.582e-01 3.339e-03 47.38
## I(predictor^2) -3.698e-04 1.226e-05 -30.16
##
## Correlation of Fixed Effects:
## (Intr) prdctr
## predictor -0.084
## I(prdctr^2) 0.071 -0.909
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
# Candidate model 4: bounded-growth (asymptotic regression, SSasymp) nonlinear
# mixed model; value approaches the asymptote yf from baseline y0 at rate
# exp(log_alpha), with random y0 and yf per unit
REM_bounded = nlmer(value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0|id) + (yf|id), data = data, start=c(yf=40, y0=20, log_alpha=-1))
y0=fixef(REM_bounded)["y0"]
yf=fixef(REM_bounded)["yf"]
log_alpha=fixef(REM_bounded)["log_alpha"]
# Closed-form mean curve; parameter defaults allow evaluating CI bounds by overriding them
equ_bounded = function(t, yf=fixef(REM_bounded)[["yf"]], y0=fixef(REM_bounded)[["y0"]], log_alpha=fixef(REM_bounded)[["log_alpha"]]) yf+(y0-yf)*exp(-exp(log_alpha)*t)
summary(REM_bounded)
## Nonlinear mixed model fit by maximum likelihood ['nlmerMod']
## Formula: value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0 | id) + (yf |
## id)
## Data: data
##
## AIC BIC logLik deviance df.resid
## 123112.3 123159.1 -61550.2 123100.3 17939
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -6.9910 -0.4896 0.0740 0.6091 5.5039
##
## Random effects:
## Groups Name Variance Std.Dev.
## id y0 157.7 12.56
## id.1 yf 639.7 25.29
## Residual 47.6 6.90
## Number of obs: 17945, groups: id, 470
##
## Fixed effects:
## Estimate Std. Error t value
## yf 57.63459 1.55630 37.03
## y0 31.00648 0.59943 51.73
## log_alpha -3.65092 0.03953 -92.35
##
## Correlation of Fixed Effects:
## yf y0
## y0 -0.034
## log_alpha -0.415 -0.103
# Growth rate on the natural scale
exp(log_alpha)
## log_alpha
## 0.02596735
cat("Average improvement over baseline: ", (yf-y0)/y0*100)
## Average improvement over baseline:  85.87919
# Compare the four candidate models on RMSE, MAE, and AIC/edf
RMSE = sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) sqrt(mean(resid(mod)^2))) # RMSE
RMSE
## [1] 8.114177 7.914272 7.539605 6.749939
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) mean(abs(resid(mod)))) # MAE
## [1] 6.033296 5.873121 5.558197 4.998443
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) extractAIC(mod)) # edf & AIC: smoothing_spline$lme always has edf 5
##        [,1]     [,2]     [,3]     [,4]
## [1,]      4.0      5.0      5.0      6.0
## [2,] 128351.9 127466.3 125820.5 123112.3
edf = sapply(list(REM_linear, REM_quadratic, smoothing_spline$gam, REM_bounded), function(mod) { nrow(data)-df.residual(mod) }) # while smoothing_spline$gam often has much higher edf
edf
## [1] 4.000000 5.000000 9.352153 6.000000
# round() is the character-returning formatter defined above, so RMSE/edf
# become character vectors here — they are only used in plot labels below
RMSE = round(RMSE,1)
edf = round(edf,1)
Plot
# Per-unit final predictor value, used to count how many units remain at each x
participation_duration = data %>% group_by(id) %>% summarise(weeks=last(predictor), .groups="keep") %>% deframe()
remaining_participants_bins = seq(0,350,by=10)
remaining_participants = data.frame(x=remaining_participants_bins, text=sapply(remaining_participants_bins, function(x) sum(participation_duration>=x)))
# Truncate the x-axis at the first bin with fewer than 10 remaining units
xmax = remaining_participants$x[which(remaining_participants$text<10)[1]]
# y-axis limited to the central 90% of all values
range_90p =quantile(data$value, probs=c(0.05,0.95))
# Trajectories plus the four fitted curves, labelled with RMSE (edf)
p1 = ggplot() +
geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.1) +
geom_line(aes(x,y), data.frame(x=0:xmax, y=predict(smoothing_spline$gam, newdata=data.frame(predictor=0:xmax))), linetype="longdash", size=1) + xlim(0,xmax) + ylim(range_90p[1], range_90p[2]) +
stat_function(fun=equ_linear, color="blue", linetype="dotted", size=1) +
stat_function(fun=equ_quadratic, color="green4", linetype="dashed", size=1) +
stat_function(fun=equ_bounded, color=linecolor, size=1) +
theme_pubr(base_family="Serif") + no_x + xlab(NULL) + ylab(params$unit) +
geom_richtext(aes(x,y,label=label,hjust=1), data.frame(x=0.8*xmax, y=range_90p[2]-(range_90p[2]-range_90p[1])/1.3, label=paste0("Model RMSE (edf):<br><span style='color:#0000ff'>····· Linear: ", RMSE[1], " (", edf[1], ") </span><br><span style='color:#008b00'>- - - Quadratic: ", RMSE[2], " (", edf[2], ") </span><br><span style='color:#000000'>— — Smoothing spline: ", RMSE[3], " (", edf[3], ") </span><br><span style='color:", linecolor, "'>— Bounded growth: ", RMSE[4], " (", edf[4], ") </span>")), family="Serif")
# Strip below the main plot showing remaining units per bin
p2 = ggplot(remaining_participants[remaining_participants$x<=xmax,]) +
geom_text(aes(x=x,y="A",label=text), family="Serif") +
theme_pubr(base_family="Serif") + no_y + xlab(params$xlab) + ylab(paste0("Remaining \n ", params$unit_n, "s")) +
scale_x_continuous(breaks=seq(0,xmax,by=10))
(p1 / p2) + plot_layout(heights=c(0.9,0.1))
## Warning: Removed 964 row(s) containing missing values (geom_path).
if (params$bounded.growth.confidence.interval) conf = confint.merMod(REM_bounded, c("y0","yf","log_alpha"), method="profile")
## Computing profile confidence intervals ...
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
if (params$bounded.growth.confidence.interval) {
print(conf)
# rate CI on the natural scale (row 3 is log_alpha)
print(exp(conf[3,]))
# NOTE(review): the improvement CI combines yf and y0 interval endpoints,
# which is not a formal CI for the ratio (yf-y0)/y0 — confirm intended
print(paste0("Average boundary improvement over baseline: ", round((yf-y0)/y0*100, 1), " (", round((conf[1,1]-conf[2,1])/conf[2,1]*100, 1), "-", round((conf[1,2]-conf[2,2])/conf[2,2]*100, 1), ")"))
}
## 2.5 % 97.5 %
## yf 54.639426 60.80591
## y0 29.828866 32.18362
## log_alpha -3.729284 -3.57407
## 2.5 % 97.5 %
## 0.02401002 0.02804148
## [1] "Average boundary improvement over baseline: 85.9 (83.2-88.9)"
Bounded growth model
# Derivative of the bounded-growth curve: slope at time t
equ_diff_REM_bounded = function(t) exp(log_alpha)*(yf-y0)*exp(exp(log_alpha)*-t)
# Inverse of the derivative: time at which the slope equals target_slope
equ_diff_get_time_REM_bounded = function(target_slope) log(exp(log_alpha)*(yf-y0)/target_slope)/exp(log_alpha)
# Inverse of the curve: time at which the mean reaches target_value
equ_bounded_get_x = function(target_value) exp(-log_alpha)*log((yf-y0)/(yf-target_value))
growth_percentiles = c(0, 0.5, 0.9)
names_percentiles = c("baseline", "half-practice point", "90% practice")
# Timepoints at which 0% / 50% / 90% of the total practice gain (yf - y0) is reached
selected_timepoints = equ_bounded_get_x(y0+(yf-y0)*growth_percentiles)
# Annotation points: value and slope at each selected timepoint
example_slopes_bounded = data.frame(
x=selected_timepoints,
y=equ_bounded(selected_timepoints),
label=paste0("y=", round(equ_bounded(selected_timepoints),1), ", m=", signif(equ_diff_REM_bounded(selected_timepoints),2), " at ", params$unit_time, " ", round(selected_timepoints,0), ", ", names_percentiles),
vjust=1.5
)
example_slopes_bounded = rbind(example_slopes_bounded, list(x=0.83*xmax, y=yf, label=paste0("boundary: ", round(yf, 1)), vjust=-1.0))
# Pointwise ribbon built from the profile CI bounds of all three parameters
if (params$bounded.growth.confidence.interval) ribbon = data.frame(x=seq(0,xmax,0.05), ymin=equ_bounded(seq(0,xmax,0.05), conf["yf","2.5 %"], conf["y0","2.5 %"], conf["log_alpha","2.5 %"]), ymax=equ_bounded(seq(0,xmax,0.05), conf["yf","97.5 %"], conf["y0","97.5 %"], conf["log_alpha","97.5 %"]))
quant = quantile(table(data$id))
print(paste0("n tests = ", nrow(data), " (n ", params$unit_n, "s = ", length(unique(data$id)), ", median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 17945 (n hands = 470, median tests per hand: 18, IQR 10-42.75)"
# Trajectories plus the fitted bounded-growth curve, annotated with the
# selected timepoints, the asymptote, and (optionally) the CI ribbon
p1 = ggplot() + geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.2) +
theme_pubr(base_family="Serif") + scale_x_continuous(limits = c(0,xmax), expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + xlab(params$xlab) + ylab(params$unit) +
# dashed vertical line at the half-practice point
geom_vline(xintercept=example_slopes_bounded[2,"x"], color=linecolor, linetype=2) +
stat_function(fun=equ_bounded, color=linecolor, size=1) +
# last row is the boundary annotation, which gets no point marker
geom_point(data=example_slopes_bounded[1:(nrow(example_slopes_bounded)-1),], aes(x,y), color=linecolor, size=5) +
geom_text(data=example_slopes_bounded, aes(x,y,label=label, vjust=vjust), color=linecolor, hjust=-0.01, family="Serif")
if (params$bounded.growth.confidence.interval) p1 = p1 + geom_ribbon(aes(x=x, ymin=ymin, ymax=ymax), ribbon, fill=linecolor, alpha=0.3)
p1
## Warning: Removed 480 row(s) containing missing values (geom_path).
Quantile regression
# Censor follow-up at the half-practice point (or an explicit override in
# params) so the quantile-regression slopes describe the initial, steep part
# of the learning curve.
if (is.null(params$censor_after)) {
censor_after = as.integer(round(selected_timepoints[2])) # half-practice point; round() is the character-returning formatter defined above, hence as.integer()
} else {
censor_after = params$censor_after
}
data_censored = data[data$predictor <= censor_after,]
# Quantile regression of score on repetition at five percentiles
percentiles = c(0.05,0.25,0.5,0.75,0.95)
QR = rq(value ~ predictor, tau=percentiles, data_censored)
# Kernel-based standard errors; compute the per-tau summaries once instead of
# re-running summary(QR, se="ker") inside every iteration, and use
# vapply/seq_along rather than sapply(1:length(...)) for type safety
qr_summaries = summary(QR, se="ker")
p_vals = vapply(seq_along(qr_summaries), function(i) {
summ = coef(qr_summaries[[i]])
print(summ)
print(paste0("Intercept: ", round(summ[1,1],1), " (", round(summ[1,1]-1.96*summ[1,2],1), "-", round(summ[1,1]+1.96*summ[1,2],1), "), beta: ", round(summ[2,1],2), " (", round(summ[2,1]-1.96*summ[2,2],2), "-", round(summ[2,1]+1.96*summ[2,2],2), ")"))
summ[2,4] # p-value of the slope (predictor) coefficient
}, numeric(1))
## Value Std. Error t value Pr(>|t|)
## (Intercept) 8.4545455 0.49444260 17.099145 0.000000e+00
## predictor 0.2727273 0.03885682 7.018775 2.408296e-12
## [1] "Intercept: 8.5 (7.5-9.4), beta: 0.27 (0.20-0.35)"
## Value Std. Error t value Pr(>|t|)
## (Intercept) 20.8571429 0.34196863 60.99139 0
## predictor 0.5714286 0.02945366 19.40094 0
## [1] "Intercept: 20.9 (20.2-21.5), beta: 0.57 (0.51-0.63)"
## Value Std. Error t value Pr(>|t|)
## (Intercept) 30.70 0.36373533 84.40203 0
## predictor 0.65 0.02542666 25.56372 0
## [1] "Intercept: 30.7 (30.0-31.4), beta: 0.65 (0.60-0.70)"
## Value Std. Error t value Pr(>|t|)
## (Intercept) 42.52 0.3733377 113.89154 0
## predictor 0.48 0.0252041 19.04452 0
## [1] "Intercept: 42.5 (41.8-43.3), beta: 0.48 (0.43-0.53)"
## Value Std. Error t value Pr(>|t|)
## (Intercept) 54.0769231 0.43694387 123.76172 0
## predictor 0.3846154 0.03197911 12.02708 0
## [1] "Intercept: 54.1 (53.2-54.9), beta: 0.38 (0.32-0.45)"
# Bonferroni-adjust the per-percentile slope p-values (one test per tau)
p_vals = p.adjust(p_vals, method="bonferroni")
# Joint test of whether the slopes differ across quantiles
ANOVA = anova(QR)
# Distribution of number of tests per unit in the censored data
quant = quantile(table(data_censored$id))
print(paste0("n tests = ", nrow(data_censored), " (median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 8593 (median tests per hand: 18, IQR 10-28)"
# Format a p-value for display: "= <value rounded to `digits` significant
# digits>" or "< 2e-16" when the value underflows to exactly zero.
# x: a single p-value; digits: significant digits to keep (default 1).
signif_p = function(x, digits=1) {
x = signif(x, digits)
# compare numerically instead of via as.character(); signif() of an underflowed
# p-value is exactly 0, so an exact comparison is safe here
if (x == 0) return("< 2e-16")
else return(paste0("= ", x))
}
# Spaghetti plot of individual (censored) trajectories with one fitted
# quantile-regression line per percentile, annotated with adjusted p-values.
# NOTE(review): aes_string() is deprecated in current ggplot2 — consider aes(.data[["predictor"]], ...)
ggplot() + geom_line(aes_string("predictor", "value", group="id"), data_censored, alpha=0.2, color="darkgrey") + theme_pubr(base_family="Serif") + scale_x_continuous(expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + theme(legend.position = "none") + xlab(params$xlab) + ylab(params$unit) +
# columns of coef(QR) correspond to the taus in `percentiles`
geom_abline(intercept=coef(QR)[1,], slope=coef(QR)[2,], color=linecolor) +
geom_text(data=data.frame(intercept=coef(QR)[1,], label=paste0(percentiles*100, "th percentile: β = ", round(coef(QR)[2,],1), ", ", "p.adj ", sapply(p_vals,signif_p))),
mapping=aes(x=1,y=intercept, label=label), color=linecolor, hjust="left", vjust=1, family="Serif") +
coord_cartesian(xlim=c(0,censor_after)) +
geom_text(aes(x=x,y=y,label=label), data.frame(x=0.8*censor_after, y=0, label=paste0("ANOVA p ", signif_p(ANOVA$table$pvalue, 1))), vjust=-1.5, family="Serif") +
# mark the censoring point
geom_vline(xintercept=censor_after, color=linecolor, linetype=2)
Mobility (Two Minute Walk)
# Analysis parameters for the Two Minute Walk (mobility) test.
# Same pipeline as the other sections; the analysed unit here is a patient.
params = list(
test_code = "two_min_walk",
test_metric_code = "steps",
unit = "Two Minute Walk: Steps",
unit_n = "patient",
unit_time = "repetition",
min_repetitions = 5, # inclusion criterion: minimum tests per unit
min_weeks = 5, # inclusion criterion: minimum weeks of participation
predictor = "repetition",
xlab = "Repetitions",
bounded.growth.confidence.interval = FALSE, # FALSE instead of F (F is reassignable)
censor_after = 9, # allow comparison with cognition
up_to_date = "2021-05-01"
)
library(data.table) # fread
library(parsedate) # parse_date
library(dplyr) # group_by
library(tibble) # deframe
library(lme4) # lmer
library(mgcv) # gamm
library(quantreg) # rq
library(patchwork) # plot_layout
library(gridExtra) # grid.arrange
library(ggpubr) # ggscatter
library(ggtext) # geom_text
library(sjPlot) # plot_model
# Download from: https://dataset.floodlightopen.com/public-blobs-prod/complete_dataset.csv
data = fread("complete_dataset.csv", data.table=F)
# Prepare dataset: keep only the configured test and exclude control participants.
data = data[data$testCode == params$test_code & !data$participantIsControl,]
data$time = parse_date(data$testStartedAt)
data = data[data$time <= as.POSIXct(params$up_to_date, tz="UTC"),] # only analyse data up to (excluding) params$up_to_date
data = data[!duplicated(data),] # sometimes contains true duplicates for some reason (even with the same testResultMetricId)
# For "Finger Pinching" hand_used has to be determined
if (params$test_code == "pinching") {
library(tidyr) # pivot_wider
# just one means either "hand" or "successful_pinches" values are missing, remove those
table(table(data$time))
data = data[!data$time %in% names(which(table(data$time)==1)), ]
# Widen so that "hand" and the pinch count become separate columns keyed on timestamp.
data = as.data.frame(pivot_wider(data, id_cols=c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time"), names_from="testMetricCode", values_from="testResultMetricValue"))
} else {
# All other tests have a single metric; hand_used is not applicable.
data = data[data$testMetricCode == params$test_metric_code,]
data$hand_used = NA
data = data[c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time", "testResultMetricValue", "hand_used")]
}
# Normalise column names for the rest of the analysis.
colnames(data) = c("id", "control", "sex", "birthyear", "time", "value", "hand_used")
data$age = year(data$time)-data$birthyear # Estimate age
# Stable ordering by participant id (rows within an id keep their relative order).
data = data[order(as.character(data$id)),]
# 0 result values are discarded
data = data[!is.na(data$value) & data$value != 0,]
# Consider those supposedly younger than 18 (minimum study age) and older than 90 as NA
data$age[data$age < 18 | data$age > 90] = NA
# For hand-based tests each hand is analysed as its own unit; for other tests
# hand_used is NA, so id becomes "<id>_handNA" (one unit per patient).
data$id_original = data$id
data$id = paste0(data$id, "_hand", data$hand_used)
data$day = as.IDate(data$time)
round = function(x, digits=0) sprintf(paste0("%.", digits, "f"), x)
# Reusable theme fragments to drop axis text/ticks, and the accent color used
# for model fits and annotations throughout the report.
no_x = theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
no_y = theme(axis.text.y=element_blank(), axis.ticks.y=element_blank())
linecolor = "#c71138"
Participant selection
# At least x weeks & repetitions
# Per-participant 0-based test index ("repetition") and weeks elapsed since the
# participant's first test. A grouped mutate replaces the original per-id loop,
# which rescanned the full id vector once per participant (O(ids x rows)).
# Row order within each id is preserved, so time[1] is still the first test;
# `units=` spells out the difftime argument the loop partial-matched as `unit=`.
data = data %>%
  group_by(id) %>%
  mutate(
    repetition = seq_len(n()) - 1,
    weeksSinceFirst = as.numeric(difftime(time, time[1], units = "weeks"))
  ) %>%
  ungroup() %>%
  as.data.frame()
# Pre-filter counts, kept for the "n included / n total" report below.
n_orig = nrow(data)
n_patients_orig = length(unique(data$id_original))
n_hands_orig = length(unique(data$id))
# Per-unit participation length; rows are ordered within id, so last() picks the
# final week offset and repetition index.
participation_duration = data %>% group_by(id) %>% summarise(weeks=last(weeksSinceFirst), repetitions=last(repetition), .groups="keep")
Among the total n=540 patients with n=14031 repetitions, the median length of participation is 0.9 weeks (IQR 0.0-11.2, range 0.0-133.5) and the median number of repetitions is 3 (IQR 1-14.25, range 1-735).
data = data[data$id %in% participation_duration$id[participation_duration$weeks >= params$min_weeks & participation_duration$repetitions+1 >= params$min_repetitions],]
# Days elapsed since each participant's previous test (0 for the first test).
# lag() with the first timestamp as default reproduces the original shifted
# vector c(time[1], time[1:(n-1)]) without the per-id full-vector rescan, and
# additionally handles single-observation groups, where 1:(n-1) would have
# expanded to c(1, 0) and produced a length mismatch.
data = data %>%
  group_by(id) %>%
  mutate(daysSinceLast = as.numeric(difftime(time, lag(time, default = time[1]), units = "days"))) %>%
  ungroup() %>%
  as.data.frame()
# Per-unit demographics and participation statistics after filtering;
# inter-test intervals summarise the daysSinceLast column computed above.
participation_duration = data %>% group_by(id) %>% summarise(sex=first(sex), mean_age=mean(age), weeks=last(weeksSinceFirst), repetitions=last(repetition), median_intertest_interval=median(daysSinceLast), IQR_intertest_interval=IQR(daysSinceLast), .groups="keep")
# Generic predictor column (here: repetition), so downstream code is agnostic
# to which longitudinal axis is analysed.
data$predictor = data[,params$predictor]
Inclusion criteria: participation for at least 5 weeks and at least 5 repetitions performed per test, leading to the analysis of n=161 / 540 patients and n=12997 / 14031 tests. Among those, the median length of participation is 16.4 weeks (IQR 10.8-46.4, range 5.3-133.5) and the median number of repetitions is 37 (IQR 15-84, range 5-735).
# Descriptive table: inclusion fractions, demographics, and participation
# statistics, each formatted as "median (IQR, range)" strings via the masked
# character-returning round() helper.
t(data.frame(
n_patients = paste0(length(unique(data$id_original)), " / ", n_patients_orig, " (", round(length(unique(data$id_original))/n_patients_orig*100,1), "%)"),
n_hands = paste0(length(unique(data$id)), " / ", n_hands_orig, " (", round(length(unique(data$id))/n_hands_orig*100,1), "%)"),
n_tests = paste0(nrow(data), " / ", n_orig, " (", round(nrow(data)/n_orig*100,1), "%)"),
# NOTE(review): [[2]] assumes both FALSE and TRUE levels are present in the
# sex == "female" table — breaks if the cohort is single-sex; verify.
percent_female = paste0(round(prop.table(table(participation_duration$sex == "female"))[[2]]*100, 1)),
age = paste0(round(median(participation_duration$mean_age,na.rm=T),1), " (", round(quantile(participation_duration$mean_age, 0.25, na.rm=T),1), "-", round(quantile(participation_duration$mean_age, 0.75, na.rm=T),1), ", range ", round(min(participation_duration$mean_age, na.rm=T),1), "-", round(max(participation_duration$mean_age, na.rm=T),1), ")"),
repetitions = paste0(median(participation_duration$repetitions)+1, " repetitions (IQR ", quantile(participation_duration$repetitions+1, 0.25), "-", quantile(participation_duration$repetitions+1, 0.75), ", range ", min(participation_duration$repetitions+1), "-", max(participation_duration$repetitions+1), ")"),
median_intertest_interval = paste0(round(median(participation_duration$median_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$median_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$median_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$median_intertest_interval),1), "-", round(max(participation_duration$median_intertest_interval),1), ")"),
IQR_intertest_interval = paste0(round(median(participation_duration$IQR_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$IQR_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$IQR_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$IQR_intertest_interval),1), "-", round(max(participation_duration$IQR_intertest_interval),1), ")"),
weeks = paste0(round(median(participation_duration$weeks),1), " weeks (IQR ", round(quantile(participation_duration$weeks, 0.25),1), "-", round(quantile(participation_duration$weeks, 0.75),1), ", range ", round(min(participation_duration$weeks),1), "-", round(max(participation_duration$weeks),1), ")")
))
## [,1]
## n_patients "161 / 540 (29.8%)"
## n_hands "161 / 540 (29.8%)"
## n_tests "12997 / 14031 (92.6%)"
## percent_female "72.0"
## age "50.0 (41.7-58.0, range 20.0-74.3)"
## repetitions "37 repetitions (IQR 15-84, range 5-735)"
## median_intertest_interval "1.3 days (IQR 1.0-2.9, range 1.0-24.9)"
## IQR_intertest_interval "1.9 days (IQR 0.7-4.6, range 0.1-39.2)"
## weeks "16.4 weeks (IQR 10.8-46.4, range 5.3-133.5)"
Summary level analysis
Difference test
# One row per unit: first/last/mean score, participation stats, and the
# first-to-last difference used as the summary-level outcome.
df = as.data.frame(data %>% group_by(id) %>% summarise(first=first(value), last=last(value), mean=mean(value), weeksSinceFirst=max(weeksSinceFirst), repetition=n(), first_age=first(age), last_age=last(age), mean_age=mean(age), .groups="keep") %>% mutate(diff=last-first))
df$predictor = df[, params$predictor]
# Paired t-test of last vs first score (within-unit change).
test = t.test(df$last, df$first, paired=T)
test
##
## Paired t-test
##
## data: df$last and df$first
## t = 0.61336, df = 160, p-value = 0.5405
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.922237 9.357020
## sample estimates:
## mean of the differences
## 2.217391
# Baseline model: first-to-last difference explained by age and first score
# only (both scaled per 10 units so coefficients are readable).
mod0 = lm(diff ~ I(mean_age/10) + I(first/10), df)
summ0 = summary(mod0)
summ0
##
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -133.799 -19.684 6.243 23.605 141.065
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.5479 25.3623 2.111 0.036329 *
## I(mean_age/10) 0.5238 3.3466 0.157 0.875817
## I(first/10) -2.6593 0.7265 -3.660 0.000344 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.24 on 157 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.08709, Adjusted R-squared: 0.07546
## F-statistic: 7.488 on 2 and 157 DF, p-value: 0.0007829
# Full model: adds log10 of the number of repetitions to test whether amount
# of practice explains additional change beyond age and baseline score.
mod = lm(diff ~ I(mean_age/10) + I(first/10) + log10(predictor), df)
confint(mod)
## 2.5 % 97.5 %
## (Intercept) 0.05856387 101.087320
## I(mean_age/10) -7.32284922 6.491036
## I(first/10) -4.28838819 -1.341943
## log10(predictor) -7.57631209 21.101800
# Summary of the full model; reused by the coefficient plot below.
summ = summary(mod)
summ
##
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10) + log10(predictor),
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -134.951 -19.355 5.857 23.186 133.001
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 50.5729 25.5732 1.978 0.049738 *
## I(mean_age/10) -0.4159 3.4967 -0.119 0.905473
## I(first/10) -2.8152 0.7458 -3.775 0.000227 ***
## log10(predictor) 6.7627 7.2592 0.932 0.352979
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.26 on 156 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.09214, Adjusted R-squared: 0.07468
## F-statistic: 5.277 on 3 and 156 DF, p-value: 0.001711
# additional variance explained by predictor
#print(summ$r.squared - summ0$r.squared)
# Observed mean change expressed as percent of the mean baseline score, with
# the paired t-test CI rescaled the same way.
print(paste0("Average observed improvement over baseline: ", round(test$estimate/mean(df$first)*100, 1), " (", round(test$conf[1]/mean(df$first)*100, 1), "-", round(test$conf[2]/mean(df$first)*100, 1), ")"))
## [1] "Average observed improvement over baseline: 1.1 (-2.4-4.6)"
lab.y = 1.1*mean(df$last)
# Bar plot of first/mean/last scores (mean +- SE).
p1 = ggbarplot(data.frame(Timepoint=rep(c("First","Mean","Last"),each=nrow(df)), value=c(df$first,df$mean,df$last)), "Timepoint", "value", add="mean_se", label=T, lab.nb.digits=1, lab.vjust=1.9, ylab=params$unit) + xlab("Score") #+ stat_compare_means(comparisons = list(c("First","Last")), paired=T, method="t.test", label.y=lab.y) + scale_y_continuous(expand=expansion(mult=c(0,0.1)))
# Coefficient plot of the full model.
# NOTE(review): plot_model is given the summary object (summ), not the lm fit
# (mod) — confirm sjPlot accepts this; passing mod is the documented usage.
p2 = plot_model(summ, show.values=T, vline.color = "grey", show.intercept=T, colors=linecolor, title=paste0("Difference from First to Last Score, R²=", round(summ$r.squared, 2)), axis.labels=rev(c("Intercept", "Age (per 10 years)", "First score (per 10)", paste0(params$xlab, " (log 10)"))), value.offset=0.3, show.p=F) + ylab("β estimates")
(p1 + p2) + plot_layout(widths=c(2,5)) & theme_pubr(base_family="Serif")
Confounders
# Pairwise confounder checks: one ggscatter per variable pair (with regression
# line and correlation coefficient); predictor axes are log10-scaled.
p_age_first = ggscatter(df, "mean_age", "first", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("First score") + theme_pubr(base_family="Serif")
p_age_pred = ggscatter(df, "mean_age", "predictor", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("Mean age") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")
p_age_last = ggscatter(df, "mean_age", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("Last score") + theme_pubr(base_family="Serif")
p_age_diff = ggscatter(df, "mean_age", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("Difference first to last score") + theme_pubr(base_family="Serif")
p_first_pred = ggscatter(df, "first", "predictor", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("First score") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")
p_first_last = ggscatter(df, "first", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("First score") + ylab("Last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)
p_first_diff = ggscatter(df, "first", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("First score") + ylab("Difference first to last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)
p_pred_last = ggscatter(df, "predictor", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_x_log10(expand = expansion(mult = c(.05, .15))) + xlab(paste0(params$xlab, " (log10)")) + ylab("Last score") + theme_pubr(base_family="Serif")
p_pred_diff = ggscatter(df, "predictor", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_x_log10(expand = expansion(mult = c(.05, .15))) + xlab(paste0(params$xlab, " (log10)")) + ylab("Difference first to last score") + theme_pubr(base_family="Serif")
p_last_diff = ggscatter(df, "last", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Last score") + ylab("Difference first to last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)
p_last_pred = ggscatter(df, "last", "predictor", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("Last score") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")
# Marginal distributions for the diagonal of the matrix.
p_age = gghistogram(df, "mean_age", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_first = gghistogram(df, "first", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_last = gghistogram(df, "last", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_pred = gghistogram(df, "predictor", bins=15) + scale_x_log10() + xlab(NULL) + theme_pubr(base_family="Serif")
p_diff = gghistogram(df, "diff", bins=15) + xlab("Difference first to last") + theme_pubr(base_family="Serif")
#(((p1+xlab(NULL)) + (p2+xlab(NULL)+ylab(NULL))) / ((p3+xlab(NULL)) + (p4+xlab(NULL)+ylab(NULL))) / ((p5) | (p6+ylab(NULL)))) & theme_pubr(base_family="Serif")
#(p_age_first | p_first) / (p_age_last | p_first_last | p_last) / (p_age_pred | p_first_pred | p_last_pred | p_pred) / (p_age_diff | p_first_diff | p_last_diff | p_pred_diff)
# Lower-triangular layout: fill a 5x5 matrix column-wise so each pairwise panel
# lands below the diagonal, histograms on the diagonal.
m <- matrix(NA, 5, 5)
m[lower.tri(m, diag = T)] <- 1:15
grid.arrange(grobs=list(
p_age, p_age_first+xlab(NULL), p_age_pred+xlab(NULL), p_age_last+xlab(NULL), p_age_diff,
p_first, p_first_pred+xlab(NULL)+ylab(""), p_first_last+xlab(NULL)+ylab(""), p_first_diff+ylab(""),
p_pred, p_pred_last+xlab(NULL)+ylab(""), p_pred_diff+ylab(""),
p_last, p_last_diff+ylab(""),
p_diff
), layout_matrix=m, heights=c(1,1,1,1,1.1))
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
#GGally::ggpairs(df[c("mean_age", "first", "last", "predictor", "diff")])
#pairs(df[c("mean_age", "first", "last", "predictor", "diff")], upper.panel=NULL)
#corrplot::corrplot(cor(df[c("mean_age", "first", "last", "predictor", "diff")], use="complete.obs"))
Learning curve: Model selection
# Candidate model 1: penalized-spline GAMM with a random intercept per unit,
# letting the learning-curve shape be data-driven.
smoothing_spline = gamm(value ~ s(predictor, bs="ps"), random=list(id=~1), data=data)
summary(smoothing_spline$lme)
## Linear mixed-effects model fit by maximum likelihood
## Data: strip.offset(mf)
## AIC BIC logLik
## 122134.3 122171.7 -61062.17
##
## Random effects:
## Formula: ~Xr - 1 | g
## Structure: pdIdnot
## Xr1 Xr2 Xr3 Xr4 Xr5 Xr6 Xr7 Xr8
## StdDev: 2.966859 2.966859 2.966859 2.966859 2.966859 2.966859 2.966859 2.966859
##
## Formula: ~1 | id %in% g
## (Intercept) Residual
## StdDev: 46.81943 25.74474
##
## Fixed effects: y ~ X - 1
## Value Std.Error DF t-value p-value
## X(Intercept) 205.30328 3.727371 12835 55.07992 0.0000
## Xs(predictor)Fx1 -14.25125 9.018131 12835 -1.58029 0.1141
## Correlation:
## X(Int)
## Xs(predictor)Fx1 0.015
##
## Standardized Within-Group Residuals:
## Min Q1 Med Q3 Max
## -11.30836268 -0.27957744 0.09322983 0.42742641 6.69782984
##
## Number of Observations: 12997
## Number of Groups:
## g id %in% g
## 1 161
summary(smoothing_spline$gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## value ~ s(predictor, bs = "ps")
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 205.303 3.727 55.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(predictor) 5.879 5.879 4.502 0.000127 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.00781
## Scale est. = 662.79 n = 12997
# Candidate model 2: linear mixed model (random intercept per unit).
REM_linear = lmer(value ~ (1|id) + predictor, data)
# Fixed-effects line, used by stat_function in the comparison plot.
equ_linear = function(t) fixef(REM_linear)[1] + fixef(REM_linear)[2]*t
summary(REM_linear)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor
## Data: data
##
## REML criterion at convergence: 122140.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -11.2951 -0.2782 0.0944 0.4320 6.6383
##
## Random effects:
## Groups Name Variance Std.Dev.
## id (Intercept) 2218.6 47.10
## Residual 664.3 25.77
## Number of obs: 12997, groups: id, 161
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 205.069823 3.737751 54.864
## predictor -0.002631 0.002544 -1.034
##
## Correlation of Fixed Effects:
## (Intr)
## predictor -0.027
# Candidate model 3: quadratic mixed model (random intercept per unit).
REM_quadratic = lmer(value ~ (1|id) + predictor + I(predictor^2), data)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
# Fixed-effects parabola for plotting.
equ_quadratic = function(t) fixef(REM_quadratic)[1] + fixef(REM_quadratic)[2]*t + fixef(REM_quadratic)[3]*t^2
summary(REM_quadratic)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor + I(predictor^2)
## Data: data
##
## REML criterion at convergence: 122153.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -11.2977 -0.2803 0.0963 0.4302 6.6705
##
## Random effects:
## Groups Name Variance Std.Dev.
## id (Intercept) 2209 47.00
## Residual 664 25.77
## Number of obs: 12997, groups: id, 161
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 2.047e+02 3.733e+00 54.830
## predictor 1.217e-02 5.924e-03 2.055
## I(predictor^2) -3.029e-05 1.095e-05 -2.767
##
## Correlation of Fixed Effects:
## (Intr) prdctr
## predictor -0.047
## I(prdctr^2) 0.039 -0.903
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
# Candidate model 4: bounded-growth (asymptotic) nonlinear mixed model via
# SSasymp: value = yf + (y0 - yf) * exp(-exp(log_alpha) * predictor),
# with random baseline (y0) and asymptote (yf) per unit.
REM_bounded = nlmer(value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0|id) + (yf|id), data = data, start=c(yf=40, y0=20, log_alpha=-1))
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
# Fixed-effect estimates reused throughout the bounded-growth section.
y0=fixef(REM_bounded)["y0"]
yf=fixef(REM_bounded)["yf"]
log_alpha=fixef(REM_bounded)["log_alpha"]
# Bounded-growth curve; parameter defaults allow overriding with CI bounds.
equ_bounded = function(t, yf=fixef(REM_bounded)[["yf"]], y0=fixef(REM_bounded)[["y0"]], log_alpha=fixef(REM_bounded)[["log_alpha"]]) yf+(y0-yf)*exp(-exp(log_alpha)*t)
summary(REM_bounded)
## Warning in vcov.merMod(object, use.hessian = use.hessian): variance-covariance matrix computed from finite-difference Hessian is
## not positive definite or contains NA values: falling back to var-cov estimated from RX
## Warning in vcov.merMod(object, correlation = correlation, sigm = sig): variance-covariance matrix computed from finite-difference Hessian is
## not positive definite or contains NA values: falling back to var-cov estimated from RX
## Nonlinear mixed model fit by maximum likelihood ['nlmerMod']
## Formula: value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0 | id) + (yf |
## id)
## Data: data
##
## AIC BIC logLik deviance df.resid
## 120864.4 120909.2 -60426.2 120852.4 12991
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -12.0839 -0.2904 0.0757 0.4076 5.1580
##
## Random effects:
## Groups Name Variance Std.Dev.
## id y0 2528.6 50.28
## id.1 yf 3967.8 62.99
## Residual 584.8 24.18
## Number of obs: 12997, groups: id, 161
##
## Fixed effects:
## Estimate Std. Error t value
## yf 207.27795 5.81502 35.65
## y0 205.01053 4.02191 50.97
## log_alpha -3.84930 0.06442 -59.76
##
## Correlation of Fixed Effects:
## yf y0
## y0 -0.052
## log_alpha 0.007 -0.005
## convergence code: 0
## failure to converge in 10000 evaluations
# Growth-rate constant alpha on the natural scale.
exp(log_alpha)
## log_alpha
## 0.02129461
# Model-implied improvement from baseline (y0) to asymptote (yf), in percent.
cat("Average improvement over baseline: ", (yf-y0)/y0*100)
## Average improvement over baseline: 1.105999
# Model comparison across the four candidates: RMSE, MAE, AIC, and effective
# degrees of freedom (order: linear, quadratic, spline, bounded growth).
RMSE = sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) sqrt(mean(resid(mod)^2))) # RMSE
RMSE
## [1] 25.61528 25.60831 25.58158 23.92620
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) mean(abs(resid(mod)))) # MAE
## [1] 15.89611 15.89554 15.85751 14.71605
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) extractAIC(mod)) # edf & AIC: smoothing_spline$lme always has edf 5
## [,1] [,2] [,3] [,4]
## [1,] 4.0 5.0 5.0 6.0
## [2,] 122142.8 122137.2 122134.3 120864.4
# edf via residual df; uses the gam side of the spline fit, which reflects the
# smooth term's estimated complexity.
edf = sapply(list(REM_linear, REM_quadratic, smoothing_spline$gam, REM_bounded), function(mod) { nrow(data)-df.residual(mod) }) # while smoothing_spline$gam often has much higher edf
edf
## [1] 4.000000 5.000000 6.879167 6.000000
# Rounded for the plot annotation (round() is the character-formatting helper).
RMSE = round(RMSE,1)
edf = round(edf,1)
Plot
# Named vector: last predictor value (participation length) per unit.
participation_duration = data %>% group_by(id) %>% summarise(weeks=last(predictor), .groups="keep") %>% deframe()
# How many units remain at each predictor value, in bins of 10; the x-axis is
# truncated (xmax) where fewer than 10 units remain.
remaining_participants_bins = seq(0,350,by=10)
remaining_participants = data.frame(x=remaining_participants_bins, text=sapply(remaining_participants_bins, function(x) sum(participation_duration>=x)))
xmax = remaining_participants$x[which(remaining_participants$text<10)[1]]
# 5th-95th percentile of scores, used as y-limits to suppress extreme outliers.
range_90p =quantile(data$value, probs=c(0.05,0.95))
# Overlay all four candidate model fits on the raw trajectories.
p1 = ggplot() +
geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.1) +
geom_line(aes(x,y), data.frame(x=0:xmax, y=predict(smoothing_spline$gam, newdata=data.frame(predictor=0:xmax))), linetype="longdash", size=1) + xlim(0,xmax) + ylim(range_90p[1], range_90p[2]) +
stat_function(fun=equ_linear, color="blue", linetype="dotted", size=1) +
stat_function(fun=equ_quadratic, color="green4", linetype="dashed", size=1) +
stat_function(fun=equ_bounded, color=linecolor, size=1) +
theme_pubr(base_family="Serif") + no_x + xlab(NULL) + ylab(params$unit) +
geom_richtext(aes(x,y,label=label,hjust=1), data.frame(x=0.8*xmax, y=range_90p[2]-(range_90p[2]-range_90p[1])/1.3, label=paste0("Model RMSE (edf):<br><span style='color:#0000ff'>····· Linear: ", RMSE[1], " (", edf[1], ") </span><br><span style='color:#008b00'>- - - Quadratic: ", RMSE[2], " (", edf[2], ") </span><br><span style='color:#000000'>— — Smoothing spline: ", RMSE[3], " (", edf[3], ") </span><br><span style='color:", linecolor, "'>— Bounded growth: ", RMSE[4], " (", edf[4], ") </span>")), family="Serif")
# Bottom strip: counts of remaining units along the same x-axis.
p2 = ggplot(remaining_participants[remaining_participants$x<=xmax,]) +
geom_text(aes(x=x,y="A",label=text), family="Serif") +
theme_pubr(base_family="Serif") + no_y + xlab(params$xlab) + ylab(paste0("Remaining \n ", params$unit_n, "s")) +
scale_x_continuous(breaks=seq(0,xmax,by=10))
(p1 / p2) + plot_layout(heights=c(0.9,0.1))
## Warning: Removed 1144 row(s) containing missing values (geom_path).
# Profile confidence intervals for the bounded-growth fixed effects (slow, so
# gated behind the section's config flag).
if (params$bounded.growth.confidence.interval) conf = confint.merMod(REM_bounded, c("y0","yf","log_alpha"), method="profile")
if (params$bounded.growth.confidence.interval) {
print(conf)
# Rate constant alpha on the natural scale. Rows are addressed by name rather
# than position: the original conf[3,] / conf[1,] / conf[2,] indexing assumed
# a fixed row order, and under the requested order c("y0","yf","log_alpha")
# the positional formula computed (y0-yf)/yf instead of the stated
# (yf-y0)/y0 improvement. Named rows match the usage at the ribbon below.
print(exp(conf["log_alpha",]))
print(paste0("Average boundary improvement over baseline: ", round((yf-y0)/y0*100, 1), " (", round((conf["yf",1]-conf["y0",1])/conf["y0",1]*100, 1), "-", round((conf["yf",2]-conf["y0",2])/conf["y0",2]*100, 1), ")"))
}
Bounded growth model
# First derivative of the bounded-growth curve: slope at repetition t,
# d/dt [yf + (y0-yf)e^(-alpha t)] = alpha*(yf-y0)*e^(-alpha t).
equ_diff_REM_bounded = function(t) exp(log_alpha)*(yf-y0)*exp(exp(log_alpha)*-t)
# Inverse of the derivative: repetition at which the slope equals target_slope.
equ_diff_get_time_REM_bounded = function(target_slope) log(exp(log_alpha)*(yf-y0)/target_slope)/exp(log_alpha)
# Inverse of the curve: repetition at which the curve reaches target_value,
# t = (1/alpha) * ln((yf-y0)/(yf-target)).
equ_bounded_get_x = function(target_value) exp(-log_alpha)*log((yf-y0)/(yf-target_value))
# Annotated milestones: baseline (0%), half-practice (50%), 90% of the y0->yf gain.
growth_percentiles = c(0, 0.5, 0.9)
names_percentiles = c("baseline", "half-practice point", "90% practice")
selected_timepoints = equ_bounded_get_x(y0+(yf-y0)*growth_percentiles)
example_slopes_bounded = data.frame(
x=selected_timepoints,
y=equ_bounded(selected_timepoints),
label=paste0("y=", round(equ_bounded(selected_timepoints),1), ", m=", signif(equ_diff_REM_bounded(selected_timepoints),2), " at ", params$unit_time, " ", round(selected_timepoints,0), ", ", names_percentiles),
vjust=1.5
)
# Extra label row for the asymptote (no point is drawn for it below).
example_slopes_bounded = rbind(example_slopes_bounded, list(x=0.83*xmax, y=yf, label=paste0("boundary: ", round(yf, 1)), vjust=-1.0))
# CI ribbon: curve evaluated at the 2.5% and 97.5% profile bounds (only when
# profile CIs were computed for this section).
if (params$bounded.growth.confidence.interval) ribbon = data.frame(x=seq(0,xmax,0.05), ymin=equ_bounded(seq(0,xmax,0.05), conf["yf","2.5 %"], conf["y0","2.5 %"], conf["log_alpha","2.5 %"]), ymax=equ_bounded(seq(0,xmax,0.05), conf["yf","97.5 %"], conf["y0","97.5 %"], conf["log_alpha","97.5 %"]))
quant = quantile(table(data$id))
print(paste0("n tests = ", nrow(data), " (n ", params$unit_n, "s = ", length(unique(data$id)), ", median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 12997 (n patients = 161, median tests per patient: 37, IQR 15-84)"
# Bounded-growth curve over the raw trajectories, with milestone points/labels
# and a vertical line at the half-practice point.
p1 = ggplot() + geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.2) +
theme_pubr(base_family="Serif") + scale_x_continuous(limits = c(0,xmax), expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + xlab(params$xlab) + ylab(params$unit) +
geom_vline(xintercept=example_slopes_bounded[2,"x"], color=linecolor, linetype=2) +
stat_function(fun=equ_bounded, color=linecolor, size=1) +
# Points only for the milestone rows; the final (boundary) row is label-only.
geom_point(data=example_slopes_bounded[1:(nrow(example_slopes_bounded)-1),], aes(x,y), color=linecolor, size=5) +
geom_text(data=example_slopes_bounded, aes(x,y,label=label, vjust=vjust), color=linecolor, hjust=-0.01, family="Serif")
if (params$bounded.growth.confidence.interval) p1 = p1 + geom_ribbon(aes(x=x, ymin=ymin, ymax=ymax), ribbon, fill=linecolor, alpha=0.3)
p1
## Warning: Removed 852 row(s) containing missing values (geom_path).
Quantile regression
# Censoring point for the quantile regression: the configured value if given,
# otherwise the model-derived half-practice point.
if (is.null(params$censor_after)) {
# NOTE: round() here is the masked character-returning helper; as.integer()
# parses its string output back to an integer.
censor_after = as.integer(round(selected_timepoints[2])) # half-practice point
} else {
censor_after = params$censor_after
}
data_censored = data[data$predictor <= censor_after,]
# Quantiles at which separate regression lines are fitted.
percentiles = c(0.05,0.25,0.5,0.75,0.95)
QR = rq(value ~ predictor, tau=percentiles, data_censored)
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
# Per-quantile coefficient tables (kernel SEs): print estimates with normal-
# approximation 95% CIs and collect the slope p-value for adjustment below.
p_vals = sapply(1:length(summary(QR)), function(i) {
summ = coef(summary(QR, se="ker")[[i]])
print(summ)
print(paste0("Intercept: ", round(summ[1,1],1), " (", round(summ[1,1]-1.96*summ[1,2],1), "-", round(summ[1,1]+1.96*summ[1,2],1), "), beta: ", round(summ[2,1],2), " (", round(summ[2,1]-1.96*summ[2,2],2), "-", round(summ[2,1]+1.96*summ[2,2],2), ")"))
summ[2,4]
})
## Value Std. Error t value Pr(>|t|)
## (Intercept) 99 12.634385 7.8357592 8.437695e-15
## predictor -1 2.488586 -0.4018346 6.878606e-01
## [1] "Intercept: 99.0 (74.2-123.8), beta: -1.00 (-5.88-3.88)"
## Value Std. Error t value Pr(>|t|)
## (Intercept) 181 4.2033466 43.060927 0.0000000
## predictor 1 0.7578666 1.319493 0.1871971
## [1] "Intercept: 181.0 (172.8-189.2), beta: 1.00 (-0.49-2.49)"
## Value Std. Error t value Pr(>|t|)
## (Intercept) 217.5 2.3848809 91.199525 0.0000000
## predictor 0.5 0.4438119 1.126603 0.2600828
## [1] "Intercept: 217.5 (212.8-222.2), beta: 0.50 (-0.37-1.37)"
## Value Std. Error t value Pr(>|t|)
## (Intercept) 237.0 2.0164631 117.532525 0.0000000
## predictor 0.6 0.3844522 1.560662 0.1188055
## [1] "Intercept: 237.0 (233.0-241.0), beta: 0.60 (-0.15-1.35)"
## Value Std. Error t value Pr(>|t|)
## (Intercept) 262.0 2.1601758 121.286426 0.0000000
## predictor 0.5 0.4307378 1.160799 0.2459006
## [1] "Intercept: 262.0 (257.8-266.2), beta: 0.50 (-0.34-1.34)"
# Bonferroni-correct the per-quantile slope p-values (one test per fitted quantile).
p_vals = p.adjust(p_vals, method="bonferroni")
# Joint test across the fitted quantiles (anova on the multi-tau rq fit);
# its p-value is annotated on the plot below.
ANOVA = anova(QR)
# Five-number summary of tests per id within the censored window, for reporting.
quant = quantile(table(data_censored$id))
print(paste0("n tests = ", nrow(data_censored), " (median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 1568 (median tests per patient: 10, IQR 10-10)"
signif_p = function(x, digits=1) {
  # Format a scalar p-value for display: round to `digits` significant figures
  # and render values that collapse to exactly zero as "< 2e-16"
  # (the smallest p-value R prints). Returns a character string.
  rounded = signif(x, digits)
  if (as.character(rounded) == "0") {
    return("< 2e-16")
  }
  paste0("= ", rounded)
}
# Spaghetti plot of per-id trajectories within the censored window, overlaid with
# one regression line per fitted quantile, labelled with slope and adjusted p.
ggplot() + geom_line(aes_string("predictor", "value", group="id"), data_censored, alpha=0.2, color="darkgrey") + theme_pubr(base_family="Serif") + scale_x_continuous(expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + theme(legend.position = "none") + xlab(params$xlab) + ylab(params$unit) +
# One line per quantile: coef(QR) is a 2 x n_quantiles matrix (intercept row, slope row).
geom_abline(intercept=coef(QR)[1,], slope=coef(QR)[2,], color=linecolor) +
geom_text(data=data.frame(intercept=coef(QR)[1,], label=paste0(percentiles*100, "th percentile: β = ", round(coef(QR)[2,],1), ", ", "p.adj ", sapply(p_vals,signif_p))),
mapping=aes(x=1,y=intercept, label=label), color=linecolor, hjust="left", vjust=1, family="Serif") +
coord_cartesian(xlim=c(0,censor_after)) +
# Joint-test p-value annotation near the censoring boundary.
geom_text(aes(x=x,y=y,label=label), data.frame(x=0.8*censor_after, y=0, label=paste0("ANOVA p ", signif_p(ANOVA$table$pvalue, 1))), vjust=-1.5, family="Serif") +
geom_vline(xintercept=censor_after, color=linecolor, linetype=2)